This project is being developed by:
# Importing other important libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score,KFold,RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample
from SALib.analyze import sobol
from SALib.sample import saltelli
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
Opening our DataFrame
cd
C:\Users\kiran
# Load the training split of the bank churn dataset from a local CSV file.
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or a config value so the notebook runs on other machines.
df = pd.read_csv(r'C:\Users\kiran\Downloads\churn_data_bank\bank_data_train.csv')
DATA DESCRIPTION BEGINS
df
| ID | CR_PROD_CNT_IL | AMOUNT_RUB_CLO_PRC | PRC_ACCEPTS_A_EMAIL_LINK | APP_REGISTR_RGN_CODE | PRC_ACCEPTS_A_POS | PRC_ACCEPTS_A_TK | TURNOVER_DYNAMIC_IL_1M | CNT_TRAN_AUT_TENDENCY1M | SUM_TRAN_AUT_TENDENCY1M | ... | REST_DYNAMIC_CC_3M | MED_DEBT_PRC_YWZ | LDEAL_ACT_DAYS_PCT_TR3 | LDEAL_ACT_DAYS_PCT_AAVG | LDEAL_DELINQ_PER_MAXYWZ | TURNOVER_DYNAMIC_CC_3M | LDEAL_ACT_DAYS_PCT_TR | LDEAL_ACT_DAYS_PCT_TR4 | LDEAL_ACT_DAYS_PCT_CURR | TARGET | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 146841 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 1 | 146842 | 0 | 0.041033 | NaN | NaN | NaN | NaN | 0.0 | 0.166667 | 0.186107 | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 2 | 146843 | 0 | 0.006915 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 3 | 146844 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 4 | 146845 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 355185 | 590822 | 0 | 0.000000 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 0.142857 | 0.123579 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 |
| 355186 | 590823 | 0 | 0.000000 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 355187 | 590825 | 0 | 0.041298 | NaN | NaN | NaN | NaN | 0.0 | 0.089286 | 0.065293 | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 355188 | 590826 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 355189 | 590828 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1 |
355190 rows × 116 columns
df.head()
| ID | CR_PROD_CNT_IL | AMOUNT_RUB_CLO_PRC | PRC_ACCEPTS_A_EMAIL_LINK | APP_REGISTR_RGN_CODE | PRC_ACCEPTS_A_POS | PRC_ACCEPTS_A_TK | TURNOVER_DYNAMIC_IL_1M | CNT_TRAN_AUT_TENDENCY1M | SUM_TRAN_AUT_TENDENCY1M | ... | REST_DYNAMIC_CC_3M | MED_DEBT_PRC_YWZ | LDEAL_ACT_DAYS_PCT_TR3 | LDEAL_ACT_DAYS_PCT_AAVG | LDEAL_DELINQ_PER_MAXYWZ | TURNOVER_DYNAMIC_CC_3M | LDEAL_ACT_DAYS_PCT_TR | LDEAL_ACT_DAYS_PCT_TR4 | LDEAL_ACT_DAYS_PCT_CURR | TARGET | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 146841 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 1 | 146842 | 0 | 0.041033 | NaN | NaN | NaN | NaN | 0.0 | 0.166667 | 0.186107 | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 2 | 146843 | 0 | 0.006915 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 3 | 146844 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
| 4 | 146845 | 0 | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | ... | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | 0 |
5 rows × 116 columns
df.shape
(355190, 116)
Let us see how many columns have null values with their counts.
# Report the number of missing entries in every column.
null_counts = df.isnull().sum()
for col, n_missing in null_counts.items():
    print(f"{col}: {n_missing}")
ID: 0 CR_PROD_CNT_IL: 0 AMOUNT_RUB_CLO_PRC: 38323 PRC_ACCEPTS_A_EMAIL_LINK: 200027 APP_REGISTR_RGN_CODE: 294640 PRC_ACCEPTS_A_POS: 200027 PRC_ACCEPTS_A_TK: 200027 TURNOVER_DYNAMIC_IL_1M: 0 CNT_TRAN_AUT_TENDENCY1M: 278078 SUM_TRAN_AUT_TENDENCY1M: 278078 AMOUNT_RUB_SUP_PRC: 38323 PRC_ACCEPTS_A_AMOBILE: 200027 SUM_TRAN_AUT_TENDENCY3M: 244138 CLNT_TRUST_RELATION: 285769 PRC_ACCEPTS_TK: 200027 PRC_ACCEPTS_A_MTP: 200027 REST_DYNAMIC_FDEP_1M: 0 CNT_TRAN_AUT_TENDENCY3M: 244138 CNT_ACCEPTS_TK: 200027 APP_MARITAL_STATUS: 286956 REST_DYNAMIC_SAVE_3M: 0 CR_PROD_CNT_VCU: 0 REST_AVG_CUR: 0 CNT_TRAN_MED_TENDENCY1M: 286223 APP_KIND_OF_PROP_HABITATION: 295829 CLNT_JOB_POSITION_TYPE: 310409 AMOUNT_RUB_NAS_PRC: 38323 CLNT_JOB_POSITION: 144379 APP_DRIVING_LICENSE: 297933 TRANS_COUNT_SUP_PRC: 38323 APP_EDUCATION: 287086 CNT_TRAN_CLO_TENDENCY1M: 288894 SUM_TRAN_MED_TENDENCY1M: 286223 PRC_ACCEPTS_A_ATM: 200027 PRC_ACCEPTS_MTP: 200027 TRANS_COUNT_NAS_PRC: 38323 APP_TRAVEL_PASS: 297933 CNT_ACCEPTS_MTP: 200027 CR_PROD_CNT_TOVR: 0 APP_CAR: 297934 CR_PROD_CNT_PIL: 0 SUM_TRAN_CLO_TENDENCY1M: 288894 APP_POSITION_TYPE: 294645 TURNOVER_CC: 0 TRANS_COUNT_ATM_PRC: 38323 AMOUNT_RUB_ATM_PRC: 38323 TURNOVER_PAYM: 0 AGE: 0 CNT_TRAN_MED_TENDENCY3M: 239313 CR_PROD_CNT_CC: 0 SUM_TRAN_MED_TENDENCY3M: 239313 REST_DYNAMIC_FDEP_3M: 0 REST_DYNAMIC_IL_1M: 0 APP_EMP_TYPE: 287828 SUM_TRAN_CLO_TENDENCY3M: 240292 LDEAL_TENOR_MAX: 347189 LDEAL_YQZ_CHRG: 353949 CR_PROD_CNT_CCFP: 0 DEAL_YQZ_IR_MAX: 347189 LDEAL_YQZ_COM: 353950 DEAL_YQZ_IR_MIN: 347189 CNT_TRAN_CLO_TENDENCY3M: 240292 REST_DYNAMIC_CUR_1M: 0 REST_AVG_PAYM: 0 LDEAL_TENOR_MIN: 347189 LDEAL_AMT_MONTH: 353302 APP_COMP_TYPE: 287828 LDEAL_GRACE_DAYS_PCT_MED: 0 REST_DYNAMIC_CUR_3M: 0 CNT_TRAN_SUP_TENDENCY3M: 156472 TURNOVER_DYNAMIC_CUR_1M: 0 REST_DYNAMIC_PAYM_3M: 0 SUM_TRAN_SUP_TENDENCY3M: 156472 REST_DYNAMIC_IL_3M: 0 CNT_TRAN_ATM_TENDENCY3M: 99595 CNT_TRAN_ATM_TENDENCY1M: 149316 TURNOVER_DYNAMIC_IL_3M: 0 SUM_TRAN_ATM_TENDENCY3M: 99595 DEAL_GRACE_DAYS_ACC_S1X1: 
284741 AVG_PCT_MONTH_TO_PCLOSE: 353562 DEAL_YWZ_IR_MIN: 259477 SUM_TRAN_SUP_TENDENCY1M: 199195 DEAL_YWZ_IR_MAX: 259477 SUM_TRAN_ATM_TENDENCY1M: 149316 REST_DYNAMIC_PAYM_1M: 0 CNT_TRAN_SUP_TENDENCY1M: 199195 DEAL_GRACE_DAYS_ACC_AVG: 285757 TURNOVER_DYNAMIC_CUR_3M: 0 PACK: 0 MAX_PCLOSE_DATE: 353309 LDEAL_YQZ_PC: 352382 CLNT_SETUP_TENOR: 0 DEAL_GRACE_DAYS_ACC_MAX: 285757 TURNOVER_DYNAMIC_PAYM_3M: 0 LDEAL_DELINQ_PER_MAXYQZ: 347189 TURNOVER_DYNAMIC_PAYM_1M: 0 CLNT_SALARY_VALUE: 354478 TRANS_AMOUNT_TENDENCY3M: 51996 MED_DEBT_PRC_YQZ: 347189 TRANS_CNT_TENDENCY3M: 51996 LDEAL_USED_AMT_AVG_YQZ: 347189 REST_DYNAMIC_CC_1M: 0 LDEAL_USED_AMT_AVG_YWZ: 259477 TURNOVER_DYNAMIC_CC_1M: 0 AVG_PCT_DEBT_TO_DEAL_AMT: 353302 LDEAL_ACT_DAYS_ACC_PCT_AVG: 261742 REST_DYNAMIC_CC_3M: 0 MED_DEBT_PRC_YWZ: 259477 LDEAL_ACT_DAYS_PCT_TR3: 261742 LDEAL_ACT_DAYS_PCT_AAVG: 257015 LDEAL_DELINQ_PER_MAXYWZ: 259477 TURNOVER_DYNAMIC_CC_3M: 0 LDEAL_ACT_DAYS_PCT_TR: 261742 LDEAL_ACT_DAYS_PCT_TR4: 261742 LDEAL_ACT_DAYS_PCT_CURR: 261742 TARGET: 0
# Keep only columns whose non-null count is at least 60% of the rows,
# i.e. drop columns that are more than 40% null, so the remaining
# features are dense enough for the model.
threshold = len(df) * 0.6
df = df.dropna(thresh=threshold, axis=1)
print(df)
ID CR_PROD_CNT_IL AMOUNT_RUB_CLO_PRC TURNOVER_DYNAMIC_IL_1M \
0 146841 0 0.000000 0.0
1 146842 0 0.041033 0.0
2 146843 0 0.006915 0.0
3 146844 0 0.000000 0.0
4 146845 0 0.000000 0.0
... ... ... ... ...
355185 590822 0 0.000000 0.0
355186 590823 0 0.000000 0.0
355187 590825 0 0.041298 0.0
355188 590826 0 0.000000 0.0
355189 590828 0 0.000000 0.0
AMOUNT_RUB_SUP_PRC REST_DYNAMIC_FDEP_1M REST_DYNAMIC_SAVE_3M \
0 0.000000 0.0 0.541683
1 0.244678 0.0 0.000000
2 0.000000 0.0 0.000000
3 0.000000 0.0 0.005874
4 0.000000 0.0 0.000000
... ... ... ...
355185 0.000000 0.0 0.000000
355186 0.000000 0.0 0.000000
355187 0.095187 0.0 0.000000
355188 0.000000 0.0 0.000000
355189 0.000000 0.0 0.003551
CR_PROD_CNT_VCU REST_AVG_CUR AMOUNT_RUB_NAS_PRC ... \
0 0 156067.339767 0.000000 ...
1 0 4278.845817 0.000000 ...
2 0 112837.062817 0.000000 ...
3 0 42902.902883 0.000000 ...
4 0 71906.476533 0.000000 ...
... ... ... ... ...
355185 0 9697.620867 0.000000 ...
355186 0 428380.024733 0.262714 ...
355187 0 224884.436700 0.031179 ...
355188 0 12080.001833 0.282573 ...
355189 0 11323.728250 0.000000 ...
CLNT_SETUP_TENOR TURNOVER_DYNAMIC_PAYM_3M TURNOVER_DYNAMIC_PAYM_1M \
0 1.593023 0.0 0.0
1 1.587647 0.0 0.0
2 1.587647 0.0 0.0
3 1.583333 0.0 0.0
4 1.583333 0.0 0.0
... ... ... ...
355185 8.963872 0.0 0.0
355186 8.963872 0.0 0.0
355187 8.966560 0.0 0.0
355188 8.966560 0.0 0.0
355189 8.961184 0.0 0.0
TRANS_AMOUNT_TENDENCY3M TRANS_CNT_TENDENCY3M REST_DYNAMIC_CC_1M \
0 0.483032 0.406780 0.0
1 0.394340 0.545455 0.0
2 0.399342 0.297297 0.0
3 NaN NaN 0.0
4 0.611610 0.620690 0.0
... ... ... ...
355185 0.659039 0.785714 0.0
355186 0.652612 0.500000 0.0
355187 0.448386 0.459530 0.0
355188 1.000000 1.000000 0.0
355189 NaN NaN 0.0
TURNOVER_DYNAMIC_CC_1M REST_DYNAMIC_CC_3M TURNOVER_DYNAMIC_CC_3M \
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
4 0.0 0.0 0.0
... ... ... ...
355185 0.0 0.0 0.0
355186 0.0 0.0 0.0
355187 0.0 0.0 0.0
355188 0.0 0.0 0.0
355189 0.0 0.0 0.0
TARGET
0 0
1 0
2 0
3 0
4 0
... ...
355185 0
355186 0
355187 0
355188 0
355189 1
[355190 rows x 46 columns]
df.isnull().sum(axis=0)
ID 0 CR_PROD_CNT_IL 0 AMOUNT_RUB_CLO_PRC 38323 TURNOVER_DYNAMIC_IL_1M 0 AMOUNT_RUB_SUP_PRC 38323 REST_DYNAMIC_FDEP_1M 0 REST_DYNAMIC_SAVE_3M 0 CR_PROD_CNT_VCU 0 REST_AVG_CUR 0 AMOUNT_RUB_NAS_PRC 38323 TRANS_COUNT_SUP_PRC 38323 TRANS_COUNT_NAS_PRC 38323 CR_PROD_CNT_TOVR 0 CR_PROD_CNT_PIL 0 TURNOVER_CC 0 TRANS_COUNT_ATM_PRC 38323 AMOUNT_RUB_ATM_PRC 38323 TURNOVER_PAYM 0 AGE 0 CR_PROD_CNT_CC 0 REST_DYNAMIC_FDEP_3M 0 REST_DYNAMIC_IL_1M 0 CR_PROD_CNT_CCFP 0 REST_DYNAMIC_CUR_1M 0 REST_AVG_PAYM 0 LDEAL_GRACE_DAYS_PCT_MED 0 REST_DYNAMIC_CUR_3M 0 TURNOVER_DYNAMIC_CUR_1M 0 REST_DYNAMIC_PAYM_3M 0 REST_DYNAMIC_IL_3M 0 CNT_TRAN_ATM_TENDENCY3M 99595 TURNOVER_DYNAMIC_IL_3M 0 SUM_TRAN_ATM_TENDENCY3M 99595 REST_DYNAMIC_PAYM_1M 0 TURNOVER_DYNAMIC_CUR_3M 0 PACK 0 CLNT_SETUP_TENOR 0 TURNOVER_DYNAMIC_PAYM_3M 0 TURNOVER_DYNAMIC_PAYM_1M 0 TRANS_AMOUNT_TENDENCY3M 51996 TRANS_CNT_TENDENCY3M 51996 REST_DYNAMIC_CC_1M 0 TURNOVER_DYNAMIC_CC_1M 0 REST_DYNAMIC_CC_3M 0 TURNOVER_DYNAMIC_CC_3M 0 TARGET 0 dtype: int64
df.shape
(355190, 46)
# The ID column is just a row identifier and carries no predictive signal.
df = df.drop(columns='ID')
Box plots and histograms to determine what method to use for filling null values.
# Columns that still contain at least one null value.
columns_with_nulls = df.columns[df.isnull().any()]

# Lay out the subplots three per row. squeeze=False guarantees a 2-D axes
# array even when there is only a single row — a plain plt.subplots call
# would return a 1-D array and break the [row, col] indexing below.
num_rows = (len(columns_with_nulls) + 2) // 3
fig, axes = plt.subplots(num_rows, 3, figsize=(15, num_rows * 5), squeeze=False)

# Box plot per null-bearing column, to inspect outliers before choosing
# between mean and median imputation.
for i, column in enumerate(columns_with_nulls):
    ax = axes[i // 3, i % 3]
    sns.boxplot(x=df[column], color='red', ax=ax)
    ax.set_title(f'Box Plot of {column}')
    ax.set_xlabel(column)

# Blank out any unused subplot cells in the final row.
for j in range(len(columns_with_nulls), num_rows * 3):
    axes[j // 3, j % 3].axis('off')
plt.tight_layout()
plt.show()

# Same grid layout, but histograms, to judge skewness of each column.
fig, axes = plt.subplots(num_rows, 3, figsize=(15, num_rows * 5), squeeze=False)
for i, column in enumerate(columns_with_nulls):
    ax = axes[i // 3, i % 3]
    df[column].hist(ax=ax, color='blue')
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
for j in range(len(columns_with_nulls), num_rows * 3):
    axes[j // 3, j % 3].axis('off')
plt.tight_layout()
plt.show()
Here, we can see that there are a lot of outliers, so we use the median to fill the null values in the following columns: - AMOUNT_RUB_CLO_PRC - AMOUNT_RUB_SUP_PRC - AMOUNT_RUB_NAS_PRC - AMOUNT_RUB_ATM_PRC We use the mean to fill the null values in the following columns: - TRANS_COUNT_SUP_PRC - TRANS_COUNT_NAS_PRC - TRANS_COUNT_ATM_PRC - CNT_TRAN_ATM_TENDENCY3M - SUM_TRAN_ATM_TENDENCY3M - TRANS_AMOUNT_TENDENCY3M - TRANS_CNT_TENDENCY3M
# Median imputation for the heavily skewed AMOUNT_RUB_*_PRC columns —
# the box plots above showed many outliers, so the median is more robust.
median_cols = ['AMOUNT_RUB_CLO_PRC', 'AMOUNT_RUB_SUP_PRC',
               'AMOUNT_RUB_NAS_PRC', 'AMOUNT_RUB_ATM_PRC']
for col in median_cols:
    # Assign back instead of chained fillna(..., inplace=True): chained
    # inplace fills operate on a column view and are deprecated in
    # recent pandas (they will stop working in pandas 3.0).
    df[col] = df[col].fillna(df[col].median())

# Mean imputation for the roughly symmetric count/tendency columns.
mean_cols = ['TRANS_COUNT_SUP_PRC', 'TRANS_COUNT_NAS_PRC', 'TRANS_COUNT_ATM_PRC',
             'CNT_TRAN_ATM_TENDENCY3M', 'SUM_TRAN_ATM_TENDENCY3M',
             'TRANS_AMOUNT_TENDENCY3M', 'TRANS_CNT_TENDENCY3M']
for col in mean_cols:
    df[col] = df[col].fillna(df[col].mean())

# Confirm that no nulls remain.
df.isnull().sum()
CR_PROD_CNT_IL 0 AMOUNT_RUB_CLO_PRC 0 TURNOVER_DYNAMIC_IL_1M 0 AMOUNT_RUB_SUP_PRC 0 REST_DYNAMIC_FDEP_1M 0 REST_DYNAMIC_SAVE_3M 0 CR_PROD_CNT_VCU 0 REST_AVG_CUR 0 AMOUNT_RUB_NAS_PRC 0 TRANS_COUNT_SUP_PRC 0 TRANS_COUNT_NAS_PRC 0 CR_PROD_CNT_TOVR 0 CR_PROD_CNT_PIL 0 TURNOVER_CC 0 TRANS_COUNT_ATM_PRC 0 AMOUNT_RUB_ATM_PRC 0 TURNOVER_PAYM 0 AGE 0 CR_PROD_CNT_CC 0 REST_DYNAMIC_FDEP_3M 0 REST_DYNAMIC_IL_1M 0 CR_PROD_CNT_CCFP 0 REST_DYNAMIC_CUR_1M 0 REST_AVG_PAYM 0 LDEAL_GRACE_DAYS_PCT_MED 0 REST_DYNAMIC_CUR_3M 0 TURNOVER_DYNAMIC_CUR_1M 0 REST_DYNAMIC_PAYM_3M 0 REST_DYNAMIC_IL_3M 0 CNT_TRAN_ATM_TENDENCY3M 0 TURNOVER_DYNAMIC_IL_3M 0 SUM_TRAN_ATM_TENDENCY3M 0 REST_DYNAMIC_PAYM_1M 0 TURNOVER_DYNAMIC_CUR_3M 0 PACK 0 CLNT_SETUP_TENOR 0 TURNOVER_DYNAMIC_PAYM_3M 0 TURNOVER_DYNAMIC_PAYM_1M 0 TRANS_AMOUNT_TENDENCY3M 0 TRANS_CNT_TENDENCY3M 0 REST_DYNAMIC_CC_1M 0 TURNOVER_DYNAMIC_CC_1M 0 REST_DYNAMIC_CC_3M 0 TURNOVER_DYNAMIC_CC_3M 0 TARGET 0 dtype: int64
Let's check the overall info of our DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 355190 entries, 0 to 355189 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CR_PROD_CNT_IL 355190 non-null int64 1 AMOUNT_RUB_CLO_PRC 355190 non-null float64 2 TURNOVER_DYNAMIC_IL_1M 355190 non-null float64 3 AMOUNT_RUB_SUP_PRC 355190 non-null float64 4 REST_DYNAMIC_FDEP_1M 355190 non-null float64 5 REST_DYNAMIC_SAVE_3M 355190 non-null float64 6 CR_PROD_CNT_VCU 355190 non-null int64 7 REST_AVG_CUR 355190 non-null float64 8 AMOUNT_RUB_NAS_PRC 355190 non-null float64 9 TRANS_COUNT_SUP_PRC 355190 non-null float64 10 TRANS_COUNT_NAS_PRC 355190 non-null float64 11 CR_PROD_CNT_TOVR 355190 non-null int64 12 CR_PROD_CNT_PIL 355190 non-null int64 13 TURNOVER_CC 355190 non-null float64 14 TRANS_COUNT_ATM_PRC 355190 non-null float64 15 AMOUNT_RUB_ATM_PRC 355190 non-null float64 16 TURNOVER_PAYM 355190 non-null float64 17 AGE 355190 non-null int64 18 CR_PROD_CNT_CC 355190 non-null int64 19 REST_DYNAMIC_FDEP_3M 355190 non-null float64 20 REST_DYNAMIC_IL_1M 355190 non-null float64 21 CR_PROD_CNT_CCFP 355190 non-null int64 22 REST_DYNAMIC_CUR_1M 355190 non-null float64 23 REST_AVG_PAYM 355190 non-null float64 24 LDEAL_GRACE_DAYS_PCT_MED 355190 non-null float64 25 REST_DYNAMIC_CUR_3M 355190 non-null float64 26 TURNOVER_DYNAMIC_CUR_1M 355190 non-null float64 27 REST_DYNAMIC_PAYM_3M 355190 non-null float64 28 REST_DYNAMIC_IL_3M 355190 non-null float64 29 CNT_TRAN_ATM_TENDENCY3M 355190 non-null float64 30 TURNOVER_DYNAMIC_IL_3M 355190 non-null float64 31 SUM_TRAN_ATM_TENDENCY3M 355190 non-null float64 32 REST_DYNAMIC_PAYM_1M 355190 non-null float64 33 TURNOVER_DYNAMIC_CUR_3M 355190 non-null float64 34 PACK 355190 non-null object 35 CLNT_SETUP_TENOR 355190 non-null float64 36 TURNOVER_DYNAMIC_PAYM_3M 355190 non-null float64 37 TURNOVER_DYNAMIC_PAYM_1M 355190 non-null float64 38 TRANS_AMOUNT_TENDENCY3M 355190 non-null float64 39 TRANS_CNT_TENDENCY3M 355190 non-null float64 40 
REST_DYNAMIC_CC_1M 355190 non-null float64 41 TURNOVER_DYNAMIC_CC_1M 355190 non-null float64 42 REST_DYNAMIC_CC_3M 355190 non-null float64 43 TURNOVER_DYNAMIC_CC_3M 355190 non-null float64 44 TARGET 355190 non-null int64 dtypes: float64(36), int64(8), object(1) memory usage: 121.9+ MB
# Visualise how customers are distributed across the PACK categories.
ax = sns.countplot(data=df, x='PACK')
ax.set_title('Count Plot of PACK')
ax.set_xlabel('PACK')
ax.set_ylabel('Count')
plt.show()
It shows us that there are different packages with different code names and we cannot drop it because it might be useful for our model.
# Distribution of customer age (in months) with a KDE overlay.
ax = sns.histplot(df['AGE'], kde=True, color='purple', bins=20)
ax.set_title('Histogram of AGE')
ax.set_xlabel('AGE')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()
First of all, we can see that the age in months varies from 200 to over 1000. Furthermore, the most frequent age is around 400 months.
# Plot TURNOVER_CC against the row index to eyeball its spread.
ax = sns.scatterplot(x=df.index, y='TURNOVER_CC', data=df, color='green')
ax.set_title('Scatter Plot of TURNOVER_CC')
ax.set_xlabel('X Axis')
ax.set_ylabel('TURNOVER_CC')
ax.grid(True)
plt.tight_layout()
plt.show()
Here we can see that most of the average credit-card turnover values are below 0.5 units.
# Histogram of customer tenure (CLNT_SETUP_TENOR).
fig, ax = plt.subplots()
ax.hist(df['CLNT_SETUP_TENOR'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Histogram of CLNT_SETUP_TENOR')
ax.set_xlabel('CLNT_SETUP_TENOR')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()
The above histogram states the months of being a customer.
# Relationship between credit-card turnover and salary-account turnover.
fig = plt.figure(figsize=(8, 4))
ax = sns.scatterplot(x='TURNOVER_CC', y='TURNOVER_PAYM', data=df, color='blue')
ax.set_title('Scatter Plot between TURNOVER_CC and TURNOVER_PAYM')
ax.set_xlabel('TURNOVERCC')
ax.set_ylabel('TURNOVER_PAYM')
ax.grid(True)
plt.show()
Here we can see that most of the users with a credit-card turnover between 0 and 1 unit also have an average salary-account turnover below 1. There looks to be no real correlation, but we may keep the feature as it could be useful in combination with the other variables.
# Visual check for any relationship between the 3-month transaction-amount
# tendency and the churn label.
fig = plt.figure(figsize=(8, 6))
ax = sns.scatterplot(x='TRANS_AMOUNT_TENDENCY3M', y='TARGET', data=df, color='blue')
ax.set_title('Scatter Plot between TRANS_AMOUNT_TENDENCY3M and TARGET')
ax.set_xlabel('TRANS_AMOUNT_TENDENCY3M')
ax.set_ylabel('TARGET')
ax.grid(True)
plt.show()
There looks like no real correlation, but we may keep as it could be beneficial with the other variables.
# Visual check for any relationship between the churn label and the
# average salary-account balance.
fig = plt.figure(figsize=(8, 6))
ax = sns.scatterplot(x='TARGET', y='REST_AVG_PAYM', data=df, color='orange')
ax.set_title('Scatter Plot between TARGET and REST_AVG_PAYM')
ax.set_xlabel('TARGET')
ax.set_ylabel('REST_AVG_PAYM')
ax.grid(True)
plt.show()
There looks like no real correlation, but we may keep as it could be beneficial with the other variables.
df.shape
(355190, 45)
Since we have 45 features let's plot two correlation chart to get better visualization.
# Restrict to numeric columns before computing correlations.
numeric_df = df.select_dtypes(include='number')
correlation = numeric_df.corr()

# With ~45 features a single heatmap is unreadable, so split the matrix
# into two halves and draw one lower-triangle heatmap per half.
num_features = len(correlation)
half_features = num_features // 2
correlation_1 = correlation.iloc[:half_features, :half_features]
correlation_2 = correlation.iloc[half_features:, half_features:]

parts = [
    (correlation_1, 'Correlation Chart (First Half of Features)'),
    (correlation_2, 'Correlation Chart (Second Half of Features)'),
]
for part, title in parts:
    # Mask the redundant upper triangle (including the diagonal).
    mask = np.triu(np.ones_like(part, dtype=bool))
    plt.figure(figsize=(20, 20))
    sns.heatmap(part, mask=mask, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(title)
    plt.show()
# One hot encoding of "PACK"
encoder = OneHotEncoder()
ohreq = df[['PACK']]
oh_encoded = encoder.fit_transform(ohreq)
oh_df = pd.DataFrame(oh_encoded.toarray(), columns=encoder.get_feature_names_out(['PACK']))
oh_df.index = df.index
df = pd.concat([df, oh_df], axis=1)
# Dropping "PACK" feature afer doing its one hot encoding
df.drop('PACK', axis=1, inplace=True)
Creating new feature "total_turnover"
# Aggregate the turnover-related signals into a single feature.
# skipna=False keeps the semantics of column-wise '+' (NaN propagates).
turnover_parts = ['TURNOVER_DYNAMIC_IL_1M', 'TURNOVER_CC',
                  'TURNOVER_PAYM', 'TRANS_AMOUNT_TENDENCY3M']
df['total_turnover'] = df[turnover_parts].sum(axis=1, skipna=False)
Creating another new feature "TOTAL_PROD_CNT"
# Total number of credit products held, across all product types.
# skipna=False keeps the semantics of column-wise '+' (NaN propagates).
prod_cnt_cols = ['CR_PROD_CNT_IL', 'CR_PROD_CNT_VCU', 'CR_PROD_CNT_TOVR',
                 'CR_PROD_CNT_PIL', 'CR_PROD_CNT_CC', 'CR_PROD_CNT_CCFP']
df['TOTAL_PROD_CNT'] = df[prod_cnt_cols].sum(axis=1, skipna=False)
# Define bin edges and labels for coarse age groups (AGE is in months).
bin_edges = [168, 300, 500, 700, 900, 1128]
bin_labels = ['Group 1', 'Group 2', 'Group 3', 'Group 4', 'Group 5']
# Bin the 'AGE' feature into a categorical AGE_GROUP column.
# NOTE(review): pd.cut intervals are left-open by default, so rows with
# AGE exactly 168 (or outside 168-1128) become NaN — this is the source
# of the 20 AGE_GROUP nulls observed below; consider include_lowest=True.
df['AGE_GROUP'] = pd.cut(df['AGE'], bins=bin_edges, labels=bin_labels)
df.drop(columns=['AGE'], inplace=True)
df['TARGET'].value_counts()
TARGET 0 326265 1 28925 Name: count, dtype: int64
# Bar chart of the churn label counts to show the class imbalance.
target_counts = df['TARGET'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
target_counts.plot(kind='bar', color=['blue', 'red'], ax=ax)
ax.set_title('Distribution of TARGET Variable')
ax.set_xlabel('TARGET')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=0)
plt.show()
We can clearly see that our TARGET variable is imbalanced. We will do undersampling a little later.
df.isnull().sum()
CR_PROD_CNT_IL 0 AMOUNT_RUB_CLO_PRC 0 TURNOVER_DYNAMIC_IL_1M 0 AMOUNT_RUB_SUP_PRC 0 REST_DYNAMIC_FDEP_1M 0 REST_DYNAMIC_SAVE_3M 0 CR_PROD_CNT_VCU 0 REST_AVG_CUR 0 AMOUNT_RUB_NAS_PRC 0 TRANS_COUNT_SUP_PRC 0 TRANS_COUNT_NAS_PRC 0 CR_PROD_CNT_TOVR 0 CR_PROD_CNT_PIL 0 TURNOVER_CC 0 TRANS_COUNT_ATM_PRC 0 AMOUNT_RUB_ATM_PRC 0 TURNOVER_PAYM 0 CR_PROD_CNT_CC 0 REST_DYNAMIC_FDEP_3M 0 REST_DYNAMIC_IL_1M 0 CR_PROD_CNT_CCFP 0 REST_DYNAMIC_CUR_1M 0 REST_AVG_PAYM 0 LDEAL_GRACE_DAYS_PCT_MED 0 REST_DYNAMIC_CUR_3M 0 TURNOVER_DYNAMIC_CUR_1M 0 REST_DYNAMIC_PAYM_3M 0 REST_DYNAMIC_IL_3M 0 CNT_TRAN_ATM_TENDENCY3M 0 TURNOVER_DYNAMIC_IL_3M 0 SUM_TRAN_ATM_TENDENCY3M 0 REST_DYNAMIC_PAYM_1M 0 TURNOVER_DYNAMIC_CUR_3M 0 CLNT_SETUP_TENOR 0 TURNOVER_DYNAMIC_PAYM_3M 0 TURNOVER_DYNAMIC_PAYM_1M 0 TRANS_AMOUNT_TENDENCY3M 0 TRANS_CNT_TENDENCY3M 0 REST_DYNAMIC_CC_1M 0 TURNOVER_DYNAMIC_CC_1M 0 REST_DYNAMIC_CC_3M 0 TURNOVER_DYNAMIC_CC_3M 0 TARGET 0 PACK_101 0 PACK_102 0 PACK_103 0 PACK_104 0 PACK_105 0 PACK_107 0 PACK_108 0 PACK_109 0 PACK_301 0 PACK_K01 0 PACK_M01 0 PACK_O01 0 total_turnover 0 TOTAL_PROD_CNT 0 AGE_GROUP 20 dtype: int64
We can see that after binning age we got some null values. Let's fix it.
# Count of customers per age group.
fig = plt.figure(figsize=(8, 6))
ax = sns.countplot(x='AGE_GROUP', data=df, palette='Set2')
ax.set_title('Distribution of AGE_GROUP')
ax.set_xlabel('AGE_GROUP')
ax.set_ylabel('Count')
ax.grid(True)
plt.tight_layout()
plt.show()
# Impute the remaining AGE_GROUP nulls with the most frequent group
# (AGE_GROUP is categorical, so mean/median do not apply).
mode_age = df['AGE_GROUP'].mode()
# Fill null values with the mode.
# NOTE(review): Series.mode() returns a Series, and fillna with a Series
# aligns on the index rather than broadcasting one value — `mode_age[0]`
# was almost certainly intended. The AGE_GROUP_nan one-hot column that
# appears later suggests the nulls were in fact never filled here.
df['AGE_GROUP'].fillna(mode_age, inplace=True)
Here the AGE_GROUP has null values and we can use mode to fill those as it is a categorical column.
# Convert the categorical AGE_GROUP column into numeric indicator
# features via one-hot encoding and append them to the frame.
encoder = OneHotEncoder()
oh_encoded = encoder.fit_transform(df[['AGE_GROUP']])
oh_df = pd.DataFrame(
    oh_encoded.toarray(),
    columns=encoder.get_feature_names_out(['AGE_GROUP']),
    index=df.index,
)
df = pd.concat([df, oh_df], axis=1)

# Re-check the per-column null counts after encoding.
for column in df.columns:
    print(f"{column}: {df[column].isnull().sum()}")
CR_PROD_CNT_IL: 0 AMOUNT_RUB_CLO_PRC: 0 TURNOVER_DYNAMIC_IL_1M: 0 AMOUNT_RUB_SUP_PRC: 0 REST_DYNAMIC_FDEP_1M: 0 REST_DYNAMIC_SAVE_3M: 0 CR_PROD_CNT_VCU: 0 REST_AVG_CUR: 0 AMOUNT_RUB_NAS_PRC: 0 TRANS_COUNT_SUP_PRC: 0 TRANS_COUNT_NAS_PRC: 0 CR_PROD_CNT_TOVR: 0 CR_PROD_CNT_PIL: 0 TURNOVER_CC: 0 TRANS_COUNT_ATM_PRC: 0 AMOUNT_RUB_ATM_PRC: 0 TURNOVER_PAYM: 0 CR_PROD_CNT_CC: 0 REST_DYNAMIC_FDEP_3M: 0 REST_DYNAMIC_IL_1M: 0 CR_PROD_CNT_CCFP: 0 REST_DYNAMIC_CUR_1M: 0 REST_AVG_PAYM: 0 LDEAL_GRACE_DAYS_PCT_MED: 0 REST_DYNAMIC_CUR_3M: 0 TURNOVER_DYNAMIC_CUR_1M: 0 REST_DYNAMIC_PAYM_3M: 0 REST_DYNAMIC_IL_3M: 0 CNT_TRAN_ATM_TENDENCY3M: 0 TURNOVER_DYNAMIC_IL_3M: 0 SUM_TRAN_ATM_TENDENCY3M: 0 REST_DYNAMIC_PAYM_1M: 0 TURNOVER_DYNAMIC_CUR_3M: 0 CLNT_SETUP_TENOR: 0 TURNOVER_DYNAMIC_PAYM_3M: 0 TURNOVER_DYNAMIC_PAYM_1M: 0 TRANS_AMOUNT_TENDENCY3M: 0 TRANS_CNT_TENDENCY3M: 0 REST_DYNAMIC_CC_1M: 0 TURNOVER_DYNAMIC_CC_1M: 0 REST_DYNAMIC_CC_3M: 0 TURNOVER_DYNAMIC_CC_3M: 0 TARGET: 0 PACK_101: 0 PACK_102: 0 PACK_103: 0 PACK_104: 0 PACK_105: 0 PACK_107: 0 PACK_108: 0 PACK_109: 0 PACK_301: 0 PACK_K01: 0 PACK_M01: 0 PACK_O01: 0 total_turnover: 0 TOTAL_PROD_CNT: 0 AGE_GROUP: 20 AGE_GROUP_Group 1: 0 AGE_GROUP_Group 2: 0 AGE_GROUP_Group 3: 0 AGE_GROUP_Group 4: 0 AGE_GROUP_Group 5: 0 AGE_GROUP_nan: 0
# Drop the raw categorical column and the indicator produced for the NaN
# category. errors='ignore' keeps this safe if the NaN indicator was never
# created (i.e. once the imputation above actually removes all nulls).
df.drop(columns=['AGE_GROUP', 'AGE_GROUP_nan'], inplace=True, errors='ignore')
Let's specify the Target and Features.
# Separate the churn label from the feature matrix.
y = df['TARGET']
X = df.drop(columns=['TARGET'])
# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)
Now, let's do undersampling.
def perform_random_undersampling(X_train, y_train):
    """Balance the training data by randomly undersampling the majority class.

    Parameters:
    X_train : DataFrame or array-like
        Features of the training data.
    y_train : Series or array-like
        Target labels of the training data.

    Returns:
    tuple
        The undersampled (features, labels) pair.
    """
    # Fixed seed so the resampled subset is reproducible across runs.
    sampler = RandomUnderSampler(random_state=42)
    return sampler.fit_resample(X_train, y_train)
X_train_undersampled, y_train_undersampled = perform_random_undersampling(X_train, y_train)

# Compare the class distribution before and after balancing.
print("Before undersampling:")
print(y_train.value_counts())
print("\nAfter undersampling:")
print(y_train_undersampled.value_counts())
Before undersampling: TARGET 0 228299 1 20334 Name: count, dtype: int64 After undersampling: TARGET 0 20334 1 20334 Name: count, dtype: int64
# Standardise features: fit on the (undersampled) training set only, then
# apply the same learned transformation to the test set to avoid leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_undersampled)
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape
(40668, 61)
y_train_undersampled.shape
(40668,)
# Logistic regression tuned with a grid search over the regularisation
# strength C, using 10-fold cross-validation.
log_reg = LogisticRegression()
# Only the 'l2' penalty is searched: the default 'lbfgs' solver does not
# support 'l1', so those candidate fits would all fail (their errors were
# previously hidden by the suppressed warnings and scored as NaN).
param_grid = {'C': [0.1, 1, 10], 'penalty': ['l2']}
grid_search = GridSearchCV(log_reg, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
# Fit GridSearchCV to the balanced, scaled training data.
grid_search.fit(X_train_scaled, y_train_undersampled)
GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
scoring='accuracy')LogisticRegression()
LogisticRegression()
# Retrieve the winning hyper-parameters from the grid search.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Retrain a fresh model with those parameters on the full balanced set.
final_model = LogisticRegression(**best_params)
final_model.fit(X_train_scaled, y_train_undersampled)
Best Parameters: {'C': 0.1, 'penalty': 'l2'}
LogisticRegression(C=0.1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=0.1)
# Evaluate the tuned model on the (undersampled) training set.
y_train_pred = final_model.predict(X_train_scaled)
# Probabilities for the positive (churn) class, needed for ROC/AUC.
y_train_prob = final_model.predict_proba(X_train_scaled)[:, 1]

cm_train = confusion_matrix(y_train_undersampled, y_train_pred)
# Use accuracy_score (already imported) instead of recomputing accuracy
# by hand from the confusion matrix cells — same value, clearer intent.
accuracy_train = accuracy_score(y_train_undersampled, y_train_pred)
precision_train = precision_score(y_train_undersampled, y_train_pred)
recall_train = recall_score(y_train_undersampled, y_train_pred)
f1_train = f1_score(y_train_undersampled, y_train_pred)
roc_auc_train = roc_auc_score(y_train_undersampled, y_train_prob)
# ROC curve points for the training set (plotted later alongside test).
fpr_train, tpr_train, _ = roc_curve(y_train_undersampled, y_train_prob)

print("Confusion Matrix (Training Set):\n", cm_train)
print("Accuracy (Train):", accuracy_train)
print("Precision (Train):", precision_train)
print("Recall (Train):", recall_train)
print("F1 Score (Train):", f1_train)
print("ROC AUC Score (Train):", roc_auc_train)
Confusion Matrix (Training Set): [[12672 7662] [ 6004 14330]] Accuracy (Train): 0.6639618373168092 Precision (Train): 0.6516005820298291 Recall (Train): 0.7047309924264779 F1 Score (Train): 0.6771251712895148 ROC AUC Score (Train): 0.7298662909716576
X_test_scaled.shape
(106557, 61)
y_test.shape
(106557,)
# Evaluate the tuned model on the held-out (imbalanced) test set.
y_test_pred = final_model.predict(X_test_scaled)
# Probabilities for the positive (churn) class, needed for ROC/AUC.
y_test_prob = final_model.predict_proba(X_test_scaled)[:, 1]

cm_test = confusion_matrix(y_test, y_test_pred)
# Use accuracy_score (already imported) instead of recomputing accuracy
# by hand from the confusion matrix cells — same value, clearer intent.
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)
roc_auc_test = roc_auc_score(y_test, y_test_prob)

print("Confusion Matrix (Test Set):\n", cm_test)
print("Accuracy (Test):", accuracy_test)
print("Precision (Test):", precision_test)
print("Recall (Test):", recall_test)
print("F1 Score (Test):", f1_test)
print("ROC AUC Score (Test):", roc_auc_test)
Confusion Matrix (Test Set): [[60654 37312] [ 2577 6014]] Accuracy (Test): 0.6256557523203544 Precision (Test): 0.13880810598716706 Recall (Test): 0.700034920265394 F1 Score (Test): 0.23167748521678833 ROC AUC Score (Test): 0.7230050413871172
# Overlay train and test ROC curves to visualise how much the model
# degrades out of sample.
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
plt.figure(figsize=(10, 5))
for fpr, tpr, auc_val, colour, split in (
        (fpr_train, tpr_train, roc_auc_train, 'blue', 'Train'),
        (fpr_test, tpr_test, roc_auc_test, 'red', 'Test')):
    plt.plot(fpr, tpr, color=colour, lw=2,
             label=f'{split} ROC curve (AUC = {auc_val:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
# Rank features by the magnitude of their logistic-regression weights
# (sign gives direction of influence, absolute value gives strength).
coefficients = final_model.coef_[0]
feature_coefficients = pd.DataFrame(
    {'Feature': X_train.columns, 'Coefficient': coefficients})
feature_coefficients['Absolute Coefficient'] = (
    feature_coefficients['Coefficient'].abs())
feature_coefficients = feature_coefficients.sort_values(
    'Absolute Coefficient', ascending=False)
print("Top features:")
print(feature_coefficients.head(10))
# Horizontal bar chart of the signed coefficients
plt.figure(figsize=(20, 16))
plt.barh(feature_coefficients['Feature'], feature_coefficients['Coefficient'])
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Feature Coefficients')
plt.show()
Top features:
Feature Coefficient Absolute Coefficient
7 REST_AVG_CUR -1.211265 1.211265
25 TURNOVER_DYNAMIC_CUR_1M 0.408601 0.408601
14 TRANS_COUNT_ATM_PRC 0.380209 0.380209
21 REST_DYNAMIC_CUR_1M -0.344919 0.344919
22 REST_AVG_PAYM -0.278476 0.278476
31 REST_DYNAMIC_PAYM_1M -0.236047 0.236047
35 TURNOVER_DYNAMIC_PAYM_1M 0.226137 0.226137
33 CLNT_SETUP_TENOR -0.222930 0.222930
24 REST_DYNAMIC_CUR_3M -0.155333 0.155333
9 TRANS_COUNT_SUP_PRC 0.141405 0.141405
# Sobol global sensitivity analysis of the fitted model.
# Each of the standardised features is treated as an input varying over
# [-3, 3] (roughly +/-3 standard deviations after StandardScaler).
column_names = X_train.columns.tolist()
problem = {
    'num_vars': len(column_names),
    'names': column_names,
    'bounds': [[-3, 3] for _ in range(len(column_names))]
}
# Saltelli sampling: N=1000 base samples -> N*(2D+2) model evaluations
param_values = saltelli.sample(problem, 1000)
# Evaluate the model on ALL samples in a single vectorised call rather
# than a Python loop of one-row predicts — identical outputs, orders of
# magnitude faster for the ~124k Saltelli rows.  astype(float) matches
# the float64 array the original loop filled.
Y = final_model.predict(param_values).astype(float)
# Decompose prediction variance into first/total-order indices per feature
sobol_results = sobol.analyze(problem, Y, print_to_console=True)
print("Sobol First Order Indices:", sobol_results['S1'])
print("Sobol Total Order Indices:", sobol_results['ST'])
ST ST_conf
CR_PROD_CNT_IL 0.028027 0.014524
AMOUNT_RUB_CLO_PRC 0.020019 0.013543
TURNOVER_DYNAMIC_IL_1M 0.030029 0.013766
AMOUNT_RUB_SUP_PRC 0.020019 0.012655
REST_DYNAMIC_FDEP_1M 0.024023 0.013705
... ... ...
AGE_GROUP_Group 1 0.024023 0.011789
AGE_GROUP_Group 2 0.006006 0.007348
AGE_GROUP_Group 3 0.030029 0.014610
AGE_GROUP_Group 4 0.002002 0.003899
AGE_GROUP_Group 5 0.016015 0.008952
[61 rows x 2 columns]
S1 S1_conf
CR_PROD_CNT_IL 0.003882 0.013891
AMOUNT_RUB_CLO_PRC -0.000244 0.011314
TURNOVER_DYNAMIC_IL_1M -0.005823 0.013952
AMOUNT_RUB_SUP_PRC -0.000244 0.013105
REST_DYNAMIC_FDEP_1M 0.004004 0.012928
... ... ...
AGE_GROUP_Group 1 0.000244 0.012042
AGE_GROUP_Group 2 -0.001941 0.006317
AGE_GROUP_Group 3 -0.006067 0.015855
AGE_GROUP_Group 4 0.002063 0.004018
AGE_GROUP_Group 5 -0.003882 0.010526
[61 rows x 2 columns]
S2 S2_conf
(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC) 0.002185 0.016629
(CR_PROD_CNT_IL, TURNOVER_DYNAMIC_IL_1M) 0.006189 0.014603
(CR_PROD_CNT_IL, AMOUNT_RUB_SUP_PRC) 0.006189 0.014603
(CR_PROD_CNT_IL, REST_DYNAMIC_FDEP_1M) 0.002185 0.016629
(CR_PROD_CNT_IL, REST_DYNAMIC_SAVE_3M) 0.006189 0.014603
... ... ...
(AGE_GROUP_Group 2, AGE_GROUP_Group 4) 0.001941 0.006317
(AGE_GROUP_Group 2, AGE_GROUP_Group 5) 0.001941 0.006317
(AGE_GROUP_Group 3, AGE_GROUP_Group 4) 0.008008 0.018562
(AGE_GROUP_Group 3, AGE_GROUP_Group 5) 0.012012 0.016941
(AGE_GROUP_Group 4, AGE_GROUP_Group 5) -0.002063 0.004018
[1830 rows x 2 columns]
Sobol First Order Indices: [ 3.88192408e-03 -2.43847240e-04 -5.82288612e-03 -2.43847240e-04
4.00384770e-03 -2.18480928e-03 -1.94096204e-03 5.01212504e-01
1.94096204e-03 -1.00705811e-02 -2.18480928e-03 -4.00384770e-03
3.88192408e-03 -3.76000046e-03 1.45621232e-02 1.94096204e-03
-1.94096204e-03 -5.94480974e-03 6.18865698e-03 -1.94096204e-03
-2.06288566e-03 2.99678959e-02 1.46840469e-02 4.00384770e-03
-3.88192408e-03 4.30767516e-02 0.00000000e+00 7.15423005e-03
9.94865743e-03 -4.00384770e-03 -4.00384770e-03 2.62078955e-02
2.43847240e-04 -1.09240464e-02 -1.61373144e-02 -2.18480928e-03
1.94096204e-03 -3.88192408e-03 -1.94096204e-03 1.81903842e-03
-1.81903842e-03 -4.00384770e-03 -6.18865698e-03 4.00384770e-03
-2.06288566e-03 -4.12577132e-03 -4.24769494e-03 -6.06673336e-03
0.00000000e+00 0.00000000e+00 -1.94096204e-03 -2.18480928e-03
-7.88577178e-03 -4.00384770e-03 6.06673336e-03 4.44104190e-19
2.43847240e-04 -1.94096204e-03 -6.06673336e-03 2.06288566e-03
-3.88192408e-03]
Sobol Total Order Indices: [0.02802693 0.02001924 0.03002886 0.02001924 0.02402309 0.03403271
0.01801731 0.82479263 0.00600577 0.06606349 0.03002886 0.01601539
0.02001924 0.01601539 0.23422509 0.00600577 0.01401347 0.01000962
0.05805579 0.01000962 0.01000962 0.17016353 0.17016353 0.0080077
0.08808465 0.23022124 0.02802693 0.06005772 0.03803655 0.03603463
0.01201154 0.09809427 0.02402309 0.11811351 0.04804617 0.12211735
0.00200192 0.03203078 0.02602501 0.0420404 0.01801731 0.00400385
0.02202116 0.01601539 0.03002886 0.02802693 0.04003848 0.03803655
0. 0.0080077 0.00200192 0.04604425 0.02402309 0.03603463
0.01000962 0.02802693 0.02402309 0.00600577 0.03002886 0.00200192
0.01601539]
# Parameters ordered by total-order index, most influential first
most_influential_params = np.argsort(sobol_results['ST'])[::-1]
# Bar plot of the total-order sensitivity indices
plt.figure(figsize=(15, 8))
plt.bar(problem['names'], sobol_results['ST'])
plt.title('Total-Order Sensitivity Indices')
plt.xlabel('Parameter')
plt.ylabel('Total-Order Sensitivity Index')
plt.xticks(rotation=90)
plt.show()
# Retain only features whose total-order index clears the cut-off
threshold = 0.1
selected_features = [name for name, st_index
                     in zip(problem['names'], sobol_results['ST'])
                     if st_index > threshold]
print(selected_features)
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]
['REST_AVG_CUR', 'TRANS_COUNT_ATM_PRC', 'REST_DYNAMIC_CUR_1M', 'REST_AVG_PAYM', 'TURNOVER_DYNAMIC_CUR_1M', 'CLNT_SETUP_TENOR', 'TURNOVER_DYNAMIC_PAYM_1M']
# Re-balance the training split restricted to the Sobol-selected features.
X_train_usampled, y_train_usampled = perform_random_undersampling(X_train_selected, y_train)
# Before undersampling
print("Before undersampling:")
print(y_train.value_counts())
# After undersampling — report the freshly created y_train_usampled;
# the original printed the stale y_train_undersampled from the earlier
# full-feature run (the counts happened to match, masking the bug).
print("\nAfter undersampling:")
print(y_train_usampled.value_counts())
Before undersampling: TARGET 0 228299 1 20334 Name: count, dtype: int64 After undersampling: TARGET 0 20334 1 20334 Name: count, dtype: int64
# Standardise the selected features: fit on the undersampled train set only.
scaler2 = StandardScaler()
X_train_scaled = scaler2.fit_transform(X_train_usampled)
# Apply train-set statistics to the test features (no leakage)
X_test_scaled = scaler2.transform(X_test_selected)
# Logistic Regression with a hyperparameter search over C and penalty.
log_reg = LogisticRegression()
# The default 'lbfgs' solver does not support the 'l1' penalty, so the
# original flat grid silently failed half of its candidates.  A list of
# sub-grids pairs each penalty with a solver that supports it, leaving
# the 'l2' candidates on the default solver as before.
param_grid = [
    {'C': [0.1, 1, 10], 'penalty': ['l1'], 'solver': ['liblinear']},
    {'C': [0.1, 1, 10], 'penalty': ['l2']},
]
# 10-fold CV over the grid, parallelised across all cores
grid_search = GridSearchCV(log_reg, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_usampled)
GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
scoring='accuracy')LogisticRegression()
LogisticRegression()
# Sanity check: 7 Sobol-selected features, rows match the undersampled target
# (the bare tuples below are the captured notebook cell outputs).
X_train_scaled.shape
(40668, 7)
y_train_usampled.shape
(40668,)
# Report the winning configuration and refit a fresh estimator with it
# on the full undersampled, scaled training data.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
final_model = LogisticRegression(**best_params).fit(X_train_scaled, y_train_usampled)
Best Parameters: {'C': 10, 'penalty': 'l2'}
LogisticRegression(C=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=10)
# Training-set evaluation of the reduced-feature Logistic Regression.
y_train_pred = final_model.predict(X_train_scaled)
y_train_prob = final_model.predict_proba(X_train_scaled)[:, 1]  # P(class 1)
cm_train = confusion_matrix(y_train_usampled, y_train_pred)
# Accuracy from the confusion-matrix diagonal
accuracy_train = np.trace(cm_train) / cm_train.sum()
precision_train = precision_score(y_train_usampled, y_train_pred)
recall_train = recall_score(y_train_usampled, y_train_pred)
f1_train = f1_score(y_train_usampled, y_train_pred)
roc_auc_train = roc_auc_score(y_train_usampled, y_train_prob)
# ROC points retained for the combined train/test plot below
fpr_train, tpr_train, _ = roc_curve(y_train_usampled, y_train_prob)
print("Confusion Matrix (Training Set):\n", cm_train)
print("Accuracy (Train):", accuracy_train)
print("Precision (Train):", precision_train)
print("Recall (Train):", recall_train)
print("F1 Score (Train):", f1_train)
print("ROC AUC Score (Train):", roc_auc_train)
Confusion Matrix (Training Set): [[12340 7994] [ 5952 14382]] Accuracy (Train): 0.6570768171535359 Precision (Train): 0.6427422238112264 Recall (Train): 0.7072882856299794 F1 Score (Train): 0.6734722547412784 ROC AUC Score (Train): 0.716468372977995
# Sanity check: scaled test features (7 selected columns) vs test labels
# (the bare tuples below are the captured notebook cell outputs).
X_test_scaled.shape
(106557, 7)
y_test.shape
(106557,)
# Held-out test-set evaluation of the reduced-feature Logistic Regression.
y_test_pred = final_model.predict(X_test_scaled)
y_test_prob = final_model.predict_proba(X_test_scaled)[:, 1]  # P(class 1)
cm_test = confusion_matrix(y_test, y_test_pred)
# Accuracy from the confusion-matrix diagonal
accuracy_test = np.trace(cm_test) / cm_test.sum()
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)
# Threshold-independent ranking quality
roc_auc_test = roc_auc_score(y_test, y_test_prob)
print("Confusion Matrix (Test Set):\n", cm_test)
print("Accuracy (Test):", accuracy_test)
print("Precision (Test):", precision_test)
print("Recall (Test):", recall_test)
print("F1 Score (Test):", f1_test)
print("ROC AUC Score (Test):", roc_auc_test)
Confusion Matrix (Test Set): [[59006 38960] [ 2534 6057]] Accuracy (Test): 0.6105933913304616 Precision (Test): 0.13454917031343713 Recall (Test): 0.7050401583052032 F1 Score (Test): 0.2259737352633935 ROC AUC Score (Test): 0.7128585583248431
# Train/test ROC overlay for the reduced-feature model.
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
plt.figure(figsize=(10, 5))
for fpr, tpr, auc_val, colour, split in (
        (fpr_train, tpr_train, roc_auc_train, 'blue', 'Train'),
        (fpr_test, tpr_test, roc_auc_test, 'red', 'Test')):
    plt.plot(fpr, tpr, color=colour, lw=2,
             label=f'{split} ROC curve (AUC = {auc_val:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
# Signed coefficients of the reduced model, sorted largest first.
feature_coefficients = final_model.coef_[0]
coefficients_df = (
    pd.DataFrame({'Feature': X_train_selected.columns,
                  'Coefficient': feature_coefficients})
    .sort_values('Coefficient', ascending=False)
)
print("Feature Coefficients:")
print(coefficients_df)
# Horizontal bar chart of the signed coefficients
plt.figure(figsize=(8, 5))
plt.barh(coefficients_df['Feature'], coefficients_df['Coefficient'])
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Feature Coefficients')
plt.show()
Feature Coefficients:
Feature Coefficient
4 TURNOVER_DYNAMIC_CUR_1M 0.456687
1 TRANS_COUNT_ATM_PRC 0.312466
6 TURNOVER_DYNAMIC_PAYM_1M 0.217990
3 REST_AVG_PAYM -0.189524
5 CLNT_SETUP_TENOR -0.225847
2 REST_DYNAMIC_CUR_1M -0.415855
0 REST_AVG_CUR -1.298116
Confusion Matrix (Training Set): [[12340 7994] [ 5952 14382]] Accuracy (Train): 0.6570768171535359 Precision (Train): 0.6427422238112264 Recall (Train): 0.7072882856299794 F1 Score (Train): 0.6734722547412784 ROC AUC Score (Train): 0.716468372977995 Confusion Matrix (Test Set): [[59006 38960] [ 2534 6057]] Accuracy (Test): 0.6105933913304616 Precision (Test): 0.13454917031343713 Recall (Test): 0.7050401583052032 F1 Score (Test): 0.2259737352633935 ROC AUC Score (Test): 0.7128585583248431
# SVM baseline on the full feature set: undersample, scale, subsample,
# then grid-search C / gamma / kernel.
X_train_undersampled, y_train_undersampled = perform_random_undersampling(X_train, y_train)
# Fit the scaler on the undersampled training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_undersampled)
# Transform the test features with the train-set statistics (no leakage)
X_test_scaled = scaler.transform(X_test)
# SVC training scales super-linearly in n, so tune on a small random subsample
sample_size = 1000  # Adjust this according to your requirements
X_train_sampled, y_train_sampled = resample(X_train_scaled, y_train_undersampled, n_samples=sample_size, random_state=42)
# probability=True is only needed on the FINAL model: it triggers an extra
# internal cross-validation (Platt scaling) on every fit, which the
# original paid for on all 180 grid-search fits.  It does not change
# decision_function/predict, so hyperparameter selection is unaffected.
svm = SVC()
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf']}
# 10-fold CV over the grid, all cores
grid_search_svm = GridSearchCV(svm, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train_sampled, y_train_sampled)
best_params_svm = grid_search_svm.best_params_
print("Best Parameters:", best_params_svm)
# Final SVM with probability estimates enabled for ROC analysis
final_svm_model = SVC(**best_params_svm, probability=True)
final_svm_model.fit(X_train_sampled, y_train_sampled)
Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
SVC(C=10, gamma=0.01, probability=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=10, gamma=0.01, probability=True)
# Sanity check: the full undersampled/scaled train matrix and the test labels
# (the bare tuples below are the captured notebook cell outputs).
X_train_scaled.shape
(40668, 61)
y_test.shape
(106557,)
# Evaluation of the tuned SVM on the 1000-row training subsample it was fit on.
y_train_pred_svm = final_svm_model.predict(X_train_sampled)
y_train_prob_svm = final_svm_model.predict_proba(X_train_sampled)[:, 1]  # P(class 1)
cm_train_svm = confusion_matrix(y_train_sampled, y_train_pred_svm)
# Accuracy from the confusion-matrix diagonal
accuracy_train_svm = np.trace(cm_train_svm) / cm_train_svm.sum()
precision_train_svm = precision_score(y_train_sampled, y_train_pred_svm)
recall_train_svm = recall_score(y_train_sampled, y_train_pred_svm)
f1_train_svm = f1_score(y_train_sampled, y_train_pred_svm)
roc_auc_train_svm = roc_auc_score(y_train_sampled, y_train_prob_svm)
# ROC points retained for the combined train/test plot below
fpr_train_svm, tpr_train_svm, _ = roc_curve(y_train_sampled, y_train_prob_svm)
print("Confusion Matrix (Training Set - SVM):\n", cm_train_svm)
print("Accuracy (Train - SVM):", accuracy_train_svm)
print("Precision (Train - SVM):", precision_train_svm)
print("Recall (Train - SVM):", recall_train_svm)
print("F1 Score (Train - SVM):", f1_train_svm)
print("ROC AUC Score (Train - SVM):", roc_auc_train_svm)
Confusion Matrix (Training Set - SVM): [[444 72] [ 93 391]] Accuracy (Train - SVM): 0.835 Precision (Train - SVM): 0.8444924406047516 Recall (Train - SVM): 0.8078512396694215 F1 Score (Train - SVM): 0.8257655755015839 ROC AUC Score (Train - SVM): 0.9114032769556025
# Held-out test-set evaluation of the tuned SVM.
y_test_pred_svm = final_svm_model.predict(X_test_scaled)
y_test_prob_svm = final_svm_model.predict_proba(X_test_scaled)[:, 1]  # P(class 1)
cm_test_svm = confusion_matrix(y_test, y_test_pred_svm)
# Accuracy from the confusion-matrix diagonal
accuracy_test_svm = np.trace(cm_test_svm) / cm_test_svm.sum()
precision_test_svm = precision_score(y_test, y_test_pred_svm)
recall_test_svm = recall_score(y_test, y_test_pred_svm)
f1_test_svm = f1_score(y_test, y_test_pred_svm)
# Threshold-independent ranking quality
roc_auc_test_svm = roc_auc_score(y_test, y_test_prob_svm)
print("Confusion Matrix (Test Set - SVM):\n", cm_test_svm)
print("Accuracy (Test - SVM):", accuracy_test_svm)
print("Precision (Test - SVM):", precision_test_svm)
print("Recall (Test - SVM):", recall_test_svm)
print("F1 Score (Test - SVM):", f1_test_svm)
print("ROC AUC Score (Test - SVM):", roc_auc_test_svm)
Confusion Matrix (Test Set - SVM): [[63777 34189] [ 3423 5168]] Accuracy (Test - SVM): 0.6470245971639592 Precision (Test - SVM): 0.13131082145488732 Recall (Test - SVM): 0.6015597718542661 F1 Score (Test - SVM): 0.215566864102778 ROC AUC Score (Test - SVM): 0.6812774706818494
# Train/test ROC overlay for the SVM.
fpr_test_svm, tpr_test_svm, _ = roc_curve(y_test, y_test_prob_svm)
plt.figure(figsize=(10, 5))
for fpr, tpr, auc_val, colour, split in (
        (fpr_train_svm, tpr_train_svm, roc_auc_train_svm, 'blue', 'Train'),
        (fpr_test_svm, tpr_test_svm, roc_auc_test_svm, 'red', 'Test')):
    plt.plot(fpr, tpr, color=colour, lw=2,
             label=f'{split} ROC curve (AUC = {auc_val:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - SVM')
plt.legend()
plt.show()
import lime
from lime import lime_tabular
import random
# Explain individual SVM predictions with LIME local surrogate models.
explainer = lime_tabular.LimeTabularExplainer(
    X_train_sampled, mode='classification', feature_names=X_train.columns)
# Pick a handful of random training rows to interpret
num_instances = 3
random_indices = random.sample(range(len(X_train_sampled)), num_instances)
# Fancy indexing already yields an ndarray; np.array keeps an explicit copy
instances_to_interpret = np.array(X_train_sampled[random_indices])
# Collect each instance's local feature weights for later inspection
local_feature_importance_list = []
for i, instance in enumerate(instances_to_interpret):
    # Fit a local linear surrogate around this single instance
    explanation = explainer.explain_instance(
        instance, final_svm_model.predict_proba,
        num_features=len(X_train.columns))
    local_feature_importance = explanation.as_list()
    local_feature_importance_list.append(local_feature_importance)
    print(f"Local feature importance for instance {i + 1}:")
    for feature, weight in local_feature_importance:
        # LIME labels look like "a < FEAT <= b"; keep the text left of " <= "
        feature_name = feature.split(" <= ")[0]
        print(f" Feature: {feature_name}, Importance: {weight}")
Local feature importance for instance 1: Feature: REST_DYNAMIC_FDEP_1M, Importance: -0.14704638840144174 Feature: REST_DYNAMIC_FDEP_3M, Importance: -0.1343561014634013 Feature: LDEAL_GRACE_DAYS_PCT_MED, Importance: -0.128537518976211 Feature: TURNOVER_DYNAMIC_IL_1M, Importance: -0.12534781358947433 Feature: TURNOVER_CC, Importance: -0.12038296708124654 Feature: TURNOVER_DYNAMIC_IL_3M, Importance: -0.09394090499906342 Feature: REST_AVG_PAYM, Importance: 0.08660016197642273 Feature: REST_DYNAMIC_PAYM_1M, Importance: 0.08104084346987353 Feature: CR_PROD_CNT_CCFP, Importance: -0.0770686421301206 Feature: TRANS_AMOUNT_TENDENCY3M > 0.95, Importance: 0.062753645021981 Feature: PACK_301, Importance: -0.0626304207084737 Feature: REST_DYNAMIC_SAVE_3M, Importance: -0.059846834410807105 Feature: TURNOVER_DYNAMIC_CUR_1M > 0.26, Importance: 0.05425795603405833 Feature: CLNT_SETUP_TENOR, Importance: 0.050496393916219476 Feature: TURNOVER_DYNAMIC_CC_1M, Importance: -0.049842379012377 Feature: CR_PROD_CNT_VCU, Importance: -0.0493855862261953 Feature: TRANS_COUNT_ATM_PRC > 0.61, Importance: 0.045205082043697 Feature: -0.27 < REST_AVG_CUR, Importance: 0.044276938051427926 Feature: REST_DYNAMIC_IL_1M, Importance: -0.0430587859453054 Feature: AGE_GROUP_Group 5, Importance: -0.04279117875755887 Feature: TURNOVER_DYNAMIC_CC_3M, Importance: -0.04204834392195089 Feature: -0.68 < REST_DYNAMIC_CUR_1M, Importance: 0.03762203899019968 Feature: CR_PROD_CNT_CC, Importance: -0.035041104520155096 Feature: CR_PROD_CNT_TOVR, Importance: -0.034870056474790724 Feature: TURNOVER_PAYM, Importance: -0.03395871354887335 Feature: PACK_104, Importance: -0.033850301141275886 Feature: AGE_GROUP_Group 4, Importance: -0.033545763215867774 Feature: REST_DYNAMIC_IL_3M, Importance: -0.03260713696437662 Feature: CR_PROD_CNT_IL, Importance: -0.030521606245448484 Feature: PACK_101, Importance: -0.03006706483002833 Feature: TRANS_CNT_TENDENCY3M > 0.77, Importance: -0.02751241165506097 Feature: CNT_TRAN_ATM_TENDENCY3M 
> 0.80, Importance: 0.027018138982351465 Feature: TOTAL_PROD_CNT, Importance: -0.026445103339784413 Feature: REST_DYNAMIC_CC_1M, Importance: -0.025560463404692972 Feature: SUM_TRAN_ATM_TENDENCY3M > 1.03, Importance: -0.024048910288208446 Feature: PACK_105, Importance: -0.023602784166806577 Feature: CR_PROD_CNT_PIL, Importance: -0.02334951495356999 Feature: PACK_O01, Importance: -0.02243341170398687 Feature: TURNOVER_DYNAMIC_PAYM_3M, Importance: -0.018725893410317527 Feature: -0.61 < AGE_GROUP_Group 3, Importance: 0.018304753226403927 Feature: PACK_102, Importance: -0.01569936867155987 Feature: REST_DYNAMIC_CC_3M, Importance: -0.013963631207409575 Feature: PACK_107, Importance: -0.013516881676065457 Feature: REST_DYNAMIC_PAYM_3M, Importance: 0.012806238292290906 Feature: PACK_103, Importance: -0.011665778388627774 Feature: TRANS_COUNT_NAS_PRC, Importance: -0.011329953237167366 Feature: TURNOVER_DYNAMIC_PAYM_1M, Importance: 0.01126833693208732 Feature: TRANS_COUNT_SUP_PRC, Importance: -0.010986207756181577 Feature: AMOUNT_RUB_CLO_PRC, Importance: 0.009156705895887664 Feature: -0.19 < total_turnover, Importance: -0.00903862769297212 Feature: TURNOVER_DYNAMIC_CUR_3M > 0.97, Importance: 0.008842021310980677 Feature: AMOUNT_RUB_SUP_PRC, Importance: -0.00879065079255652 Feature: PACK_K01 > -0.55, Importance: -0.008051215081985141 Feature: AMOUNT_RUB_ATM_PRC > 0.93, Importance: 0.0078060771537052575 Feature: AGE_GROUP_Group 1, Importance: -0.007720580835343515 Feature: AMOUNT_RUB_NAS_PRC, Importance: -0.005815733831187624 Feature: REST_DYNAMIC_CUR_3M > 0.91, Importance: -0.004951308508316479 Feature: AGE_GROUP_Group 2, Importance: 0.002430761940829926 Feature: PACK_108, Importance: 0.0 Feature: PACK_109, Importance: 0.0 Feature: PACK_M01, Importance: 0.0 Local feature importance for instance 2: Feature: LDEAL_GRACE_DAYS_PCT_MED, Importance: -0.15033962639953805 Feature: TURNOVER_DYNAMIC_IL_1M, Importance: -0.13162791320437056 Feature: REST_DYNAMIC_FDEP_3M, Importance: 
-0.12209109931776727 Feature: TURNOVER_CC, Importance: -0.1181814835312058 Feature: REST_DYNAMIC_CUR_1M > 0.24, Importance: -0.11175606909627735 Feature: REST_DYNAMIC_FDEP_1M, Importance: -0.1067421436640174 Feature: REST_AVG_PAYM, Importance: 0.09413727496114302 Feature: TURNOVER_DYNAMIC_CC_1M, Importance: -0.08513845392246741 Feature: TURNOVER_DYNAMIC_IL_3M, Importance: -0.07879574593275772 Feature: CR_PROD_CNT_VCU, Importance: -0.07605456474547337 Feature: REST_DYNAMIC_PAYM_1M, Importance: 0.07491733355085692 Feature: TRANS_AMOUNT_TENDENCY3M > 0.95, Importance: 0.06960093640113842 Feature: AGE_GROUP_Group 5, Importance: -0.061405751604960576 Feature: REST_DYNAMIC_SAVE_3M, Importance: -0.06055926129782365 Feature: REST_DYNAMIC_CC_1M, Importance: -0.060404267234258686 Feature: TURNOVER_DYNAMIC_CUR_1M > 0.26, Importance: 0.05973097568919207 Feature: REST_AVG_CUR, Importance: 0.059293602441898415 Feature: CR_PROD_CNT_CCFP, Importance: -0.057775266630382106 Feature: PACK_104, Importance: -0.054858303529103335 Feature: PACK_301, Importance: -0.05452163816277369 Feature: REST_DYNAMIC_IL_1M, Importance: -0.049760705020195646 Feature: CR_PROD_CNT_TOVR, Importance: -0.04849033889735181 Feature: TRANS_CNT_TENDENCY3M > 0.77, Importance: -0.03697573423330415 Feature: -0.88 < CLNT_SETUP_TENOR, Importance: 0.03565213994941347 Feature: CR_PROD_CNT_IL, Importance: -0.03450365483177571 Feature: CR_PROD_CNT_PIL, Importance: -0.03269968022081783 Feature: TOTAL_PROD_CNT, Importance: -0.031946135899510515 Feature: CNT_TRAN_ATM_TENDENCY3M > 0.80, Importance: 0.03140530455953711 Feature: TURNOVER_PAYM, Importance: -0.030461971546487544 Feature: SUM_TRAN_ATM_TENDENCY3M > 1.03, Importance: -0.028745886982190702 Feature: REST_DYNAMIC_CC_3M, Importance: -0.027742188798718693 Feature: PACK_101, Importance: -0.024733199307204797 Feature: TURNOVER_DYNAMIC_PAYM_3M, Importance: -0.02143200613060704 Feature: -0.67 < PACK_102, Importance: 0.019538779413642775 Feature: AMOUNT_RUB_CLO_PRC, 
Importance: 0.018906796582977704 Feature: REST_DYNAMIC_IL_3M, Importance: -0.018181765666110246 Feature: CR_PROD_CNT_CC, Importance: -0.016856014358800037 Feature: AGE_GROUP_Group 3, Importance: -0.01668374828399379 Feature: PACK_105, Importance: -0.016095334248290762 Feature: AGE_GROUP_Group 4, Importance: -0.015354812188503283 Feature: PACK_O01, Importance: -0.015040748735270566 Feature: TRANS_COUNT_NAS_PRC, Importance: -0.012957698089780464 Feature: AMOUNT_RUB_NAS_PRC, Importance: -0.011919845347396266 Feature: -0.19 < total_turnover, Importance: -0.011408083618180919 Feature: PACK_103, Importance: -0.011005188551739329 Feature: TRANS_COUNT_SUP_PRC, Importance: -0.010266928591673125 Feature: REST_DYNAMIC_PAYM_3M, Importance: 0.009813522843497855 Feature: TURNOVER_DYNAMIC_CC_3M, Importance: -0.009020165792229867 Feature: PACK_K01, Importance: 0.00844230707037991 Feature: -0.75 < AMOUNT_RUB_ATM_PRC, Importance: -0.007046573174242423 Feature: AMOUNT_RUB_SUP_PRC, Importance: 0.0057512658587970085 Feature: PACK_107, Importance: -0.005455849179279083 Feature: -0.15 < TRANS_COUNT_ATM_PRC, Importance: 0.004573793532338947 Feature: TURNOVER_DYNAMIC_CUR_3M > 0.97, Importance: 0.004270413539247264 Feature: TURNOVER_DYNAMIC_PAYM_1M, Importance: 0.0035055109408266235 Feature: -1.09 < AGE_GROUP_Group 2, Importance: -0.0028267596092826906 Feature: REST_DYNAMIC_CUR_3M > 0.91, Importance: -0.001372786973392137 Feature: AGE_GROUP_Group 1, Importance: -0.0010534799595602927 Feature: PACK_M01, Importance: 0.0 Feature: PACK_108, Importance: 0.0 Feature: PACK_109, Importance: 0.0 Local feature importance for instance 3: Feature: LDEAL_GRACE_DAYS_PCT_MED, Importance: -0.18273374408271564 Feature: REST_DYNAMIC_FDEP_1M, Importance: -0.15247721830109842 Feature: TURNOVER_DYNAMIC_CC_1M, Importance: -0.15007838957166184 Feature: TURNOVER_DYNAMIC_IL_1M, Importance: -0.1340305217061444 Feature: TURNOVER_CC, Importance: -0.1252740006590734 Feature: REST_DYNAMIC_FDEP_3M, Importance: 
-0.12247812900900056 Feature: AGE_GROUP_Group 5, Importance: -0.1170846831336262 Feature: TURNOVER_DYNAMIC_IL_3M, Importance: -0.11127853178152769 Feature: REST_AVG_PAYM > -0.16, Importance: -0.09388590232178452 Feature: CR_PROD_CNT_VCU, Importance: -0.08681019637887757 Feature: REST_DYNAMIC_CUR_1M, Importance: 0.08066765418119314 Feature: REST_DYNAMIC_PAYM_1M > -0.29, Importance: -0.07128051725401972 Feature: CR_PROD_CNT_CCFP, Importance: -0.06631842572320816 Feature: REST_DYNAMIC_SAVE_3M, Importance: -0.06382160983065908 Feature: REST_AVG_CUR, Importance: 0.058925603135970804 Feature: PACK_104, Importance: -0.05803740348369156 Feature: TRANS_COUNT_SUP_PRC > 0.53, Importance: 0.0493869118647884 Feature: REST_DYNAMIC_IL_3M, Importance: -0.04675463404818441 Feature: TURNOVER_DYNAMIC_CC_3M, Importance: -0.04565866952598865 Feature: PACK_301, Importance: -0.04152068067444318 Feature: REST_DYNAMIC_CC_3M, Importance: -0.04015092895731365 Feature: CR_PROD_CNT_TOVR, Importance: -0.03673529812106345 Feature: TURNOVER_PAYM > -0.19, Importance: 0.03370349033155072 Feature: total_turnover > -0.19, Importance: 0.032315697514408215 Feature: REST_DYNAMIC_CC_1M, Importance: -0.030857073495342422 Feature: AGE_GROUP_Group 4, Importance: -0.03050067755754937 Feature: CR_PROD_CNT_IL, Importance: -0.02784785164336459 Feature: TURNOVER_DYNAMIC_CUR_1M, Importance: -0.027460786032147747 Feature: TOTAL_PROD_CNT, Importance: -0.027037710216433575 Feature: REST_DYNAMIC_IL_1M, Importance: -0.0249776956129783 Feature: CR_PROD_CNT_PIL, Importance: -0.024814933330928164 Feature: CR_PROD_CNT_CC, Importance: -0.02395966963025233 Feature: TRANS_COUNT_ATM_PRC, Importance: -0.023703950629053713 Feature: AMOUNT_RUB_CLO_PRC > -0.15, Importance: -0.022461252823012482 Feature: PACK_105 > -0.42, Importance: 0.02144722103133847 Feature: AGE_GROUP_Group 3, Importance: -0.01824763788508651 Feature: PACK_103, Importance: -0.017616403428768183 Feature: TURNOVER_DYNAMIC_PAYM_3M > -0.39, Importance: 
0.016606445745080942 Feature: -0.07 < TRANS_AMOUNT_TENDENCY3M, Importance: 0.015536407473162435 Feature: PACK_101, Importance: -0.015198224966769647 Feature: PACK_K01, Importance: 0.013833835672309017 Feature: REST_DYNAMIC_PAYM_3M > -0.41, Importance: -0.013120603181323976 Feature: -0.07 < TRANS_CNT_TENDENCY3M, Importance: -0.012527493299462861 Feature: REST_DYNAMIC_CUR_3M, Importance: 0.012300638932376417 Feature: PACK_O01, Importance: -0.011530029344972046 Feature: TURNOVER_DYNAMIC_CUR_3M, Importance: -0.011511048620943327 Feature: TRANS_COUNT_NAS_PRC, Importance: -0.01123294967664602 Feature: AMOUNT_RUB_NAS_PRC, Importance: -0.010371804849121369 Feature: AMOUNT_RUB_ATM_PRC, Importance: -0.009909888561614328 Feature: PACK_102, Importance: -0.00949138013166266 Feature: -0.22 < CLNT_SETUP_TENOR, Importance: -0.008450207614140896 Feature: AGE_GROUP_Group 1, Importance: -0.007611115016231038 Feature: PACK_107, Importance: -0.006491310042647481 Feature: TURNOVER_DYNAMIC_PAYM_1M > -0.27, Importance: -0.004371326696409258 Feature: -0.49 < SUM_TRAN_ATM_TENDENCY3M, Importance: -0.0011512691169009409 Feature: AMOUNT_RUB_SUP_PRC > 0.05, Importance: -0.0007911555254184891 Feature: -1.09 < AGE_GROUP_Group 2, Importance: 0.000412829614688812 Feature: -0.60 < CNT_TRAN_ATM_TENDENCY3M, Importance: -0.00012432112300862357 Feature: PACK_M01, Importance: 0.0 Feature: PACK_108, Importance: 0.0 Feature: PACK_109, Importance: 0.0
# Visualize local feature importance for each explained instance.
import re

for i, local_feature_importance in enumerate(local_feature_importance_list):
    plt.figure(figsize=(20, 20))
    # BUG FIX: LIME labels carry binning conditions in several forms
    # ("x <= 0.5", "x > 0.9", "-0.27 < x"); the original split only on
    # " <= ", leaving the other forms attached to the bar labels.
    features = [
        re.sub(r"\s*(?:<=?|>=?)\s*-?\d+(?:\.\d+)?$", "",
               re.sub(r"^-?\d+(?:\.\d+)?\s*<=?\s*", "", feature)).strip()
        for feature, _ in local_feature_importance
    ]
    weights = [weight for _, weight in local_feature_importance]
    plt.barh(features, weights, color='skyblue')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(f'Local Feature Importance for Instance {i+1}')
    plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
    plt.show()
# Approximate a "global" importance score per feature: perturb one feature
# at a time on a single reference instance, re-explain with LIME, and
# average the resulting local weights for the positive class.
instance_to_interpret = X_train_sampled[0]
global_feature_importance = np.zeros(X_train_sampled.shape[1])
for feature_index in range(len(X_train.columns)):
    # Copy the reference instance and replace one feature with a value
    # drawn from that feature's column in the sampled training data.
    perturbed_instance = np.array(instance_to_interpret)
    perturbed_instance[feature_index] = np.random.choice(X_train_sampled[:, feature_index])
    # Explain the perturbed instance with the fitted SVM.
    perturbed_explanation = explainer.explain_instance(perturbed_instance, final_svm_model.predict_proba, num_features=len(X_train.columns))
    # Weights for the positive class (label index 1).
    class_weights = perturbed_explanation.local_exp[1]
    # Mean weight over all features becomes this feature's score.
    global_feature_importance[feature_index] = np.mean([w for _, w in class_weights])
# Rank features from highest to lowest score.
sorted_indices = np.argsort(global_feature_importance)[::-1]
print("\nGlobal feature importance:")
for idx in sorted_indices:
    print(f"Feature: {X_train.columns[idx]}, Importance: {global_feature_importance[idx]}")
Global feature importance: Feature: TURNOVER_DYNAMIC_PAYM_3M, Importance: -0.02402635451036813 Feature: TRANS_COUNT_ATM_PRC, Importance: -0.024085165851705306 Feature: PACK_107, Importance: -0.0242412017054293 Feature: AMOUNT_RUB_SUP_PRC, Importance: -0.02496658279819607 Feature: TOTAL_PROD_CNT, Importance: -0.025601312618376088 Feature: PACK_103, Importance: -0.026088222683597853 Feature: PACK_105, Importance: -0.02620470391843965 Feature: REST_DYNAMIC_SAVE_3M, Importance: -0.026284698433760672 Feature: TRANS_COUNT_NAS_PRC, Importance: -0.02653705871451709 Feature: TURNOVER_DYNAMIC_IL_3M, Importance: -0.026635751337777673 Feature: REST_DYNAMIC_CC_3M, Importance: -0.02670042056654317 Feature: LDEAL_GRACE_DAYS_PCT_MED, Importance: -0.026782281274570965 Feature: PACK_301, Importance: -0.0267929318519944 Feature: REST_AVG_CUR, Importance: -0.026827069687588103 Feature: PACK_108, Importance: -0.02700023351233272 Feature: REST_DYNAMIC_PAYM_3M, Importance: -0.02700478565935417 Feature: REST_DYNAMIC_IL_3M, Importance: -0.027055387571754796 Feature: CR_PROD_CNT_PIL, Importance: -0.02706837782232121 Feature: PACK_101, Importance: -0.02709359051498582 Feature: REST_DYNAMIC_CUR_3M, Importance: -0.02712206503117224 Feature: PACK_102, Importance: -0.027268521243739675 Feature: TURNOVER_DYNAMIC_CC_3M, Importance: -0.027276029325054258 Feature: TRANS_AMOUNT_TENDENCY3M, Importance: -0.02733801445210523 Feature: SUM_TRAN_ATM_TENDENCY3M, Importance: -0.02736884684856256 Feature: TURNOVER_CC, Importance: -0.02744184629552572 Feature: CNT_TRAN_ATM_TENDENCY3M, Importance: -0.027461692809109824 Feature: REST_DYNAMIC_IL_1M, Importance: -0.027535723900900824 Feature: CR_PROD_CNT_VCU, Importance: -0.027540784412696483 Feature: TURNOVER_PAYM, Importance: -0.02756353909292793 Feature: TURNOVER_DYNAMIC_IL_1M, Importance: -0.027572678146996772 Feature: REST_DYNAMIC_PAYM_1M, Importance: -0.027620890557878364 Feature: AMOUNT_RUB_ATM_PRC, Importance: -0.0276965140593232 Feature: 
REST_DYNAMIC_FDEP_3M, Importance: -0.02772631636544595 Feature: TURNOVER_DYNAMIC_PAYM_1M, Importance: -0.027734178130707388 Feature: CLNT_SETUP_TENOR, Importance: -0.027755874118872863 Feature: TRANS_CNT_TENDENCY3M, Importance: -0.027760441834269452 Feature: REST_DYNAMIC_CC_1M, Importance: -0.02778434017687035 Feature: AGE_GROUP_Group 2, Importance: -0.027946610390325165 Feature: TURNOVER_DYNAMIC_CUR_3M, Importance: -0.027985174525529573 Feature: PACK_104, Importance: -0.02800479027846567 Feature: REST_DYNAMIC_FDEP_1M, Importance: -0.028047703919529928 Feature: PACK_109, Importance: -0.028152856960064792 Feature: PACK_M01, Importance: -0.028263372986734696 Feature: TRANS_COUNT_SUP_PRC, Importance: -0.02837634877554955 Feature: REST_DYNAMIC_CUR_1M, Importance: -0.028383292625548198 Feature: REST_AVG_PAYM, Importance: -0.028433052122288045 Feature: CR_PROD_CNT_CCFP, Importance: -0.028437276056741304 Feature: PACK_K01, Importance: -0.0284771983342934 Feature: CR_PROD_CNT_TOVR, Importance: -0.02853707755464891 Feature: AGE_GROUP_Group 3, Importance: -0.028539934459198127 Feature: TURNOVER_DYNAMIC_CUR_1M, Importance: -0.028544731302237936 Feature: CR_PROD_CNT_IL, Importance: -0.028559031766838685 Feature: TURNOVER_DYNAMIC_CC_1M, Importance: -0.028613844251757205 Feature: AMOUNT_RUB_CLO_PRC, Importance: -0.029002905924930548 Feature: CR_PROD_CNT_CC, Importance: -0.029090809399044893 Feature: AGE_GROUP_Group 1, Importance: -0.02913077530645805 Feature: AGE_GROUP_Group 4, Importance: -0.02931063140112495 Feature: PACK_O01, Importance: -0.029380184020841617 Feature: total_turnover, Importance: -0.029806336722668993 Feature: AGE_GROUP_Group 5, Importance: -0.030359096123964237 Feature: AMOUNT_RUB_NAS_PRC, Importance: -0.030755949369092667
# Report the ten highest-ranked features with their importance scores.
num_top_features = 10
top_feature_indices = sorted_indices[:num_top_features]
top_feature_names = X_train.columns[top_feature_indices]
top_feature_importance = global_feature_importance[top_feature_indices]
print("Top 10 Features and Their Importance:")
for ranked_idx in top_feature_indices:
    print(f"Feature: {X_train.columns[ranked_idx]}, Importance: {global_feature_importance[ranked_idx]}")
Top 10 Features and Their Importance: Feature: TURNOVER_DYNAMIC_PAYM_3M, Importance: -0.02402635451036813 Feature: TRANS_COUNT_ATM_PRC, Importance: -0.024085165851705306 Feature: PACK_107, Importance: -0.0242412017054293 Feature: AMOUNT_RUB_SUP_PRC, Importance: -0.02496658279819607 Feature: TOTAL_PROD_CNT, Importance: -0.025601312618376088 Feature: PACK_103, Importance: -0.026088222683597853 Feature: PACK_105, Importance: -0.02620470391843965 Feature: REST_DYNAMIC_SAVE_3M, Importance: -0.026284698433760672 Feature: TRANS_COUNT_NAS_PRC, Importance: -0.02653705871451709 Feature: TURNOVER_DYNAMIC_IL_3M, Importance: -0.026635751337777673
# Select the top 10 features identified by the LIME-based global ranking.
top_feature_indices = sorted_indices[:10]
X_train_top_features = X_train.iloc[:, top_feature_indices]
X_test_top_features = X_test.iloc[:, top_feature_indices]
# Rebalance the training classes by random undersampling.
X_train_undersampled, y_train_undersampled = perform_random_undersampling(X_train_top_features, y_train)
# Initialize the StandardScaler object
scaler = StandardScaler()
# Fit and transform the training features
X_train_scaled = scaler.fit_transform(X_train_undersampled)
# Transform the test features using the parameters learned from the training data
X_test_scaled = scaler.transform(X_test_top_features)
# NOTE(review): X_train_scaled / X_test_scaled are computed here but the
# model below is trained on the *unscaled* sample and evaluated on the
# unscaled X_test_top_features — the scaling appears unused in this cell,
# or the scaled arrays were meant to be used instead. TODO confirm intent.
# Sample 1000 instances from the training data
sample_size = 1000
# NOTE(review): the two .sample calls stay row-aligned only because both
# objects share the same index and the same random_state.
X_train_sampled = X_train_undersampled.sample(n=sample_size, random_state=42)
y_train_sampled = y_train_undersampled.sample(n=sample_size, random_state=42)
# Train an SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_sampled, y_train_sampled)
# Evaluate the model on the training data
train_accuracy = svm_model.score(X_train_sampled, y_train_sampled)
print(f"Train Accuracy: {train_accuracy:.2f}")
# Predict labels for the test data
y_pred_test = svm_model.predict(X_test_top_features)
# Evaluate the model on the test data
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy:.2f}")
Train Accuracy: 0.60 Test Accuracy: 0.66
# Precision, recall, and F1 on the sampled training data.
y_pred_train = svm_model.predict(X_train_sampled)
train_precision = precision_score(y_train_sampled, y_pred_train)
train_recall = recall_score(y_train_sampled, y_pred_train)
train_f1 = f1_score(y_train_sampled, y_pred_train)
print("Training Metrics:")
for metric_label, metric_value in (("Precision", train_precision), ("Recall", train_recall), ("F1-score", train_f1)):
    print(f"{metric_label}: {metric_value:.2f}")
# Precision, Recall, and F1-score for Testing Data
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)
print("\nTesting Metrics:")
for metric_label, metric_value in (("Precision", test_precision), ("Recall", test_recall), ("F1-score", test_f1)):
    print(f"{metric_label}: {metric_value:.2f}")
# ROC curves for the top-feature SVM: training sample vs. held-out test set.
train_y_scores = svm_model.decision_function(X_train_sampled)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train_sampled, train_y_scores)
train_auc = roc_auc_score(y_train_sampled, train_y_scores)
test_y_scores = svm_model.decision_function(X_test_top_features)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, test_y_scores)
test_auc = roc_auc_score(y_test, test_y_scores)
# Overlay both curves against the chance-level diagonal.
plt.figure(figsize=(10, 6))
plt.plot(train_fpr, train_tpr, "b:", linewidth=2, label=f"Train ROC Curve (AUC = {train_auc:.2f})")
plt.plot(test_fpr, test_tpr, "r-", linewidth=2, label=f"Test ROC Curve (AUC = {test_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal
plt.legend(loc="lower right", fontsize=14)
plt.xlabel("False Positive Rate (FPR)", fontsize=14)
plt.ylabel("True Positive Rate (TPR)", fontsize=14)
plt.title("ROC Curve", fontsize=16)
plt.grid(True)
plt.show()
Training Metrics: Precision: 0.66 Recall: 0.51 F1-score: 0.57 Testing Metrics: Precision: 0.12 Recall: 0.50 F1-score: 0.19
We found that AGE_GROUP_Group 4, LDEAL_GRACE_DAYS_PCT_MED, PACK_301, TURNOVER_CC, etc. were the most effective local features. For the global ranking, these were the top 10 features selected for the model, listed below along with their importance values. Feature: AGE_GROUP_Group 4, Importance: -0.0027253763333527697 Feature: AMOUNT_RUB_ATM_PRC, Importance: -0.0029518696464307425 Feature: AMOUNT_RUB_SUP_PRC, Importance: -0.0030826754407552153 Feature: TRANS_COUNT_NAS_PRC, Importance: -0.0030865940822398944 Feature: TURNOVER_DYNAMIC_CUR_1M, Importance: -0.0031280254084610206 Feature: TURNOVER_DYNAMIC_CC_3M, Importance: -0.003149758587542671 Feature: LDEAL_GRACE_DAYS_PCT_MED, Importance: -0.003168928654897231 Feature: CNT_TRAN_ATM_TENDENCY3M, Importance: -0.0031717590710851422 Feature: REST_DYNAMIC_PAYM_3M, Importance: -0.00317533773450771 Feature: REST_DYNAMIC_CUR_1M, Importance: -0.003177373507806623
# Rebuild the full-feature training set: undersample, standardize, subsample.
X_train_undersampled, y_train_undersampled = perform_random_undersampling(X_train, y_train)
# Initialize the StandardScaler object
scaler = StandardScaler()
# Fit and transform the training features
X_train_scaled = scaler.fit_transform(X_train_undersampled)
# Transform the test features using the parameters learned from the training data
X_test_scaled = scaler.transform(X_test)
# Specify the desired sample size
sample_size = 100 # Adjust this according to your requirements
# Randomly sample the training data (X and y drawn together, rows aligned)
X_train_sampled, y_train_sampled = resample(X_train_scaled, y_train_undersampled, n_samples=sample_size, random_state=42)
# Initialize the Support Vector Classifier
# NOTE(review): this `svm` estimator is never fitted or referenced in the
# visible code — the SHAP section below wraps `final_svm_model` instead.
# It appears to be dead code; confirm before removing.
svm = SVC(probability=True)
import shap

# Black-box wrapper around the tuned SVM for the kernel explainer.
def f(X):
    """Return the positive-class probability from final_svm_model."""
    return final_svm_model.predict_proba(X)[:, 1]

# Background data for the kernel explainer, summarized via shap.sample.
background_summary = shap.sample(X_train_sampled[:30], 30)  # Adjust the number of samples as needed
# Kernel SHAP explainer built from the wrapper and the background summary.
explainer = shap.KernelExplainer(f, background_summary)
# Local view: SHAP values for the first three sampled instances.
shap_values = explainer.shap_values(X_train_sampled[:3])
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, X_train_sampled[:3])
# Global view: SHAP values across the whole sampled training set.
shap_values_all = explainer.shap_values(X_train_sampled)
shap.summary_plot(shap_values_all, X_train_sampled)
0%| | 0/3 [00:00<?, ?it/s]
0%| | 0/100 [00:00<?, ?it/s]
We took 20 features for our local explanations, along with their values.
# Define a threshold for SHAP values
threshold = 0.001 # Adjust this threshold as needed
# Find features with SHAP values above the threshold
above_threshold_features = np.where(np.abs(shap_values_all).mean(0) > threshold)[0]
# Check if any features are selected
if len(above_threshold_features) == 0:
    print("No features found above the threshold. Try lowering the threshold.")
else:
    # Extract the features above the threshold
    X_train_selected = X_train_sampled[:, above_threshold_features]
    # Split the data into training and testing sets
    # NOTE(review): this reassigns y_train and y_test, clobbering the
    # notebook's original train/test targets (the later `y_test.shape`
    # cell prints (20,)). Consider distinct names such as y_train_sel /
    # y_test_sel — TODO confirm no later cell expects the originals.
    X_train_selected, X_test_selected, y_train, y_test = train_test_split(X_train_selected, y_train_sampled, test_size=0.2, random_state=42)
    # Train a new model on the selected features
    new_model = SVC(probability=True) # You can change the model if desired
    new_model.fit(X_train_selected, y_train)
    # Evaluate the new model
    accuracy = new_model.score(X_test_selected, y_test)
    print("Accuracy of the new model on the test set:", accuracy)
Accuracy of the new model on the test set: 0.75
# Refit an SVC on the SHAP-selected features and score it on both splits.
new_model = SVC(probability=True)  # You can change the model if desired
new_model.fit(X_train_selected, y_train)
# Hard predictions for each split.
y_train_pred = new_model.predict(X_train_selected)
y_test_pred = new_model.predict(X_test_selected)
# Positive-class probabilities, reused for the ROC-based metrics below.
train_pos_probs = new_model.predict_proba(X_train_selected)[:, 1]
test_pos_probs = new_model.predict_proba(X_test_selected)[:, 1]
# Training-split metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_roc_auc = roc_auc_score(y_train, train_pos_probs)
# Test-split metrics.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, new_model.predict_proba(X_test_selected)[:, 1])
print("Training Set Metrics:")
print("Accuracy:", train_accuracy)
print("Precision:", train_precision)
print("Recall:", train_recall)
print("F1 Score:", train_f1)
print("ROC AUC Score:", train_roc_auc)
print("\nTest Set Metrics:")
print("Accuracy:", test_accuracy)
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1 Score:", test_f1)
print("ROC AUC Score:", test_roc_auc)
# ROC points for the plot in the next cell.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, train_pos_probs)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, test_pos_probs)
Training Set Metrics: Accuracy: 0.8625 Precision: 0.9310344827586207 Recall: 0.75 F1 Score: 0.8307692307692308 ROC AUC Score: 0.9494949494949496 Test Set Metrics: Accuracy: 0.75 Precision: 0.75 Recall: 0.42857142857142855 F1 Score: 0.5454545454545454 ROC AUC Score: 0.8131868131868132
# Overlay the train and test ROC curves for the feature-selected SVC.
plt.figure(figsize=(8, 6))
for xs, ys, curve_label in (
    (fpr_train, tpr_train, 'ROC Curve (Train)'),
    (fpr_test, tpr_test, 'ROC Curve (Test)'),
):
    plt.plot(xs, ys, label=curve_label)
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.grid(True)
plt.show()
## Decision Tree
# Decision-tree experiment: separate features and target from the raw frame.
# NOTE(review): reusing the name `f` shadows the SHAP wrapper function
# defined earlier in the notebook.
t = df['TARGET']                 # target variable
f = df.drop(columns=['TARGET'])  # feature matrix
# 70/30 train-test split with a fixed seed for reproducibility.
f_train, f_test, t_train, t_test = train_test_split(f, t, test_size=0.3, random_state=100)
for split_part in (f_train, f_test, t_train, t_test):
    print(split_part.shape)
(248633, 61) (106557, 61) (248633,) (106557,)
# Class distribution of the training target (output shows heavy imbalance).
print(t_train.groupby(t_train).count())
TARGET 0 228299 1 20334 Name: TARGET, dtype: int64
# Balance the decision-tree training split by random undersampling.
f_train_undersampled,t_train_undersampled = perform_random_undersampling(f_train,t_train)
# Echo the undersampled feature-matrix shape (notebook cell output).
f_train_undersampled.shape
(40668, 61)
# Echo the undersampled target shape (notebook cell output).
t_train_undersampled.shape
(40668,)
# Confirm the classes are now balanced (equal counts per class in output).
t_train_undersampled.value_counts()
TARGET 0 20334 1 20334 Name: count, dtype: int64
# Standardize features: fit on the undersampled training data only, then
# apply the same transform to the test split.
scaler = StandardScaler()
f_train_scaled = scaler.fit_transform(f_train_undersampled)
f_test_scaled = scaler.transform(f_test)
# Fit a decision tree on the balanced, scaled training data.
# NOTE(review): no random_state is set, so tie-breaking during splits is
# nondeterministic across runs — consider fixing a seed for reproducibility.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(f_train_scaled, t_train_undersampled)
# --- Decision tree: evaluation on the (undersampled) training set ---
# Predicted labels on the training data.
y_train_pred_dt = decision_tree.predict(f_train_scaled)
# Positive-class probabilities for the ranking metrics.
y_train_prob_dt = decision_tree.predict_proba(f_train_scaled)[:, 1]
# Confusion matrix on the training set.
cm_train_dt = confusion_matrix(t_train_undersampled, y_train_pred_dt)
# accuracy_score equals (TN + TP) / total from the confusion matrix, but
# does not assume a 2x2 matrix and matches the other metric calls.
accuracy_train_dt = accuracy_score(t_train_undersampled, y_train_pred_dt)
precision_train_dt = precision_score(t_train_undersampled, y_train_pred_dt)
recall_train_dt = recall_score(t_train_undersampled, y_train_pred_dt)
f1_train_dt = f1_score(t_train_undersampled, y_train_pred_dt)
# ROC AUC and curve points from the predicted probabilities.
roc_auc_train_dt = roc_auc_score(t_train_undersampled, y_train_prob_dt)
fpr_train_dt, tpr_train_dt, _ = roc_curve(t_train_undersampled, y_train_prob_dt)
print("Confusion Matrix (Training Set - Decision Tree):\n", cm_train_dt)
print("Accuracy (Train - Decision Tree):", accuracy_train_dt)
print("Precision (Train - Decision Tree):", precision_train_dt)
print("Recall (Train - Decision Tree):", recall_train_dt)
print("F1 Score (Train - Decision Tree):", f1_train_dt)
print("ROC AUC Score (Train - Decision Tree):", roc_auc_train_dt)
Confusion Matrix (Training Set - Decision Tree): [[20334 0] [ 5 20329]] Accuracy (Train - Decision Tree): 0.9998770532113701 Precision (Train - Decision Tree): 1.0 Recall (Train - Decision Tree): 0.9997541064227402 F1 Score (Train - Decision Tree): 0.9998770380935986 ROC AUC Score (Train - Decision Tree): 0.9999999685589013
# Shape of the scaled test features from the earlier SVM pipeline
# (unrelated to the decision-tree variables f_test_scaled / t_test).
X_test_scaled.shape
(106557, 61)
# NOTE(review): y_test was reassigned by the SHAP-section train_test_split,
# so this prints (20,) — not the original test targets. TODO confirm.
y_test.shape
(20,)
# --- Decision tree: evaluation on the held-out test set ---
# Predicted labels on the test data.
y_test_pred_dt = decision_tree.predict(f_test_scaled)
# Positive-class probabilities for the ranking metrics.
y_test_prob_dt = decision_tree.predict_proba(f_test_scaled)[:, 1]
# Confusion matrix on the test set.
cm_test_dt = confusion_matrix(t_test, y_test_pred_dt)
# accuracy_score equals (TN + TP) / total from the confusion matrix, but
# does not assume a 2x2 matrix and matches the other metric calls.
accuracy_test_dt = accuracy_score(t_test, y_test_pred_dt)
precision_test_dt = precision_score(t_test, y_test_pred_dt)
recall_test_dt = recall_score(t_test, y_test_pred_dt)
f1_test_dt = f1_score(t_test, y_test_pred_dt)
# Ranking metric from the predicted probabilities.
roc_auc_test_dt = roc_auc_score(t_test, y_test_prob_dt)
print("Confusion Matrix (Test Set - Decision Tree):\n", cm_test_dt)
print("Accuracy (Test - Decision Tree):", accuracy_test_dt)
print("Precision (Test - Decision Tree):", precision_test_dt)
print("Recall (Test - Decision Tree):", recall_test_dt)
print("F1 Score (Test - Decision Tree):", f1_test_dt)
print("ROC AUC Score (Test - Decision Tree):", roc_auc_test_dt)
Confusion Matrix (Test Set - Decision Tree): [[66474 31492] [ 2869 5722]] Accuracy (Test - Decision Tree): 0.6775340897360098 Precision (Test - Decision Tree): 0.1537593378835922 Recall (Test - Decision Tree): 0.6660458619485508 F1 Score (Test - Decision Tree): 0.24984172033620783 ROC AUC Score (Test - Decision Tree): 0.6723178462854968
# ROC curve points for the Decision Tree on the test set
fpr_test_dt, tpr_test_dt, _ = roc_curve(t_test, y_test_prob_dt)
# Overlay train vs. test ROC curves against the chance diagonal
plt.figure(figsize=(10, 5))
plt.plot(fpr_train_dt, tpr_train_dt, color='blue', lw=2,
         label=f'Train ROC curve (AUC = {roc_auc_train_dt:.2f})')
plt.plot(fpr_test_dt, tpr_test_dt, color='red', lw=2,
         label=f'Test ROC curve (AUC = {roc_auc_test_dt:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - Decision Tree')
plt.legend()
plt.show()
## Random Forest
# Fit a 100-tree Random Forest on the undersampled training data
rf = RandomForestClassifier(n_estimators=100)
rf.fit(f_train_scaled, t_train_undersampled)
# In-sample predictions and positive-class probabilities
y_train_pred_rf = rf.predict(f_train_scaled)
y_train_prob_rf = rf.predict_proba(f_train_scaled)[:, 1]
# Confusion matrix and derived metrics on the training set
cm_train_rf = confusion_matrix(t_train_undersampled, y_train_pred_rf)
accuracy_train_rf = (cm_train_rf[0, 0] + cm_train_rf[1, 1]) / np.sum(cm_train_rf)
precision_train_rf = precision_score(t_train_undersampled, y_train_pred_rf)
recall_train_rf = recall_score(t_train_undersampled, y_train_pred_rf)
f1_train_rf = f1_score(t_train_undersampled, y_train_pred_rf)
# ROC AUC and curve points on the training set
roc_auc_train_rf = roc_auc_score(t_train_undersampled, y_train_prob_rf)
fpr_train_rf, tpr_train_rf, _ = roc_curve(t_train_undersampled, y_train_prob_rf)
# Report every training-set metric for the Random Forest
print("Confusion Matrix (Training Set - Random Forest):\n", cm_train_rf)
for label, value in [("Accuracy", accuracy_train_rf),
                     ("Precision", precision_train_rf),
                     ("Recall", recall_train_rf),
                     ("F1 Score", f1_train_rf),
                     ("ROC AUC Score", roc_auc_train_rf)]:
    print(f"{label} (Train - Random Forest):", value)
Confusion Matrix (Training Set - Random Forest): [[20331 3] [ 2 20332]] Accuracy (Train - Random Forest): 0.9998770532113701 Precision (Train - Random Forest): 0.9998524711089255 Recall (Train - Random Forest): 0.9999016425690961 F1 Score (Train - Random Forest): 0.9998770562344783 ROC AUC Score (Train - Random Forest): 0.9999985488723678
# Predictions on the test set using the final Random Forest model.
# BUG FIX: the original called decision_tree.predict / predict_proba here,
# so the "Random Forest" test metrics were actually the Decision Tree's
# (identical output to the DT cell above); use the fitted rf model.
y_test_pred_rf = rf.predict(f_test_scaled)
# Probabilities for the positive class on the test set
y_test_prob_rf = rf.predict_proba(f_test_scaled)[:, 1]
# Compute the confusion matrix on the test set for Random Forest
cm_test_rf = confusion_matrix(t_test, y_test_pred_rf)
# Compute accuracy, precision, recall, and F1 score on the test set for Random Forest
accuracy_test_rf = (cm_test_rf[0, 0] + cm_test_rf[1, 1]) / np.sum(cm_test_rf)
precision_test_rf = precision_score(t_test, y_test_pred_rf)
recall_test_rf = recall_score(t_test, y_test_pred_rf)
f1_test_rf = f1_score(t_test, y_test_pred_rf)
# Compute the ROC AUC score on the test set for Random Forest
roc_auc_test_rf = roc_auc_score(t_test, y_test_prob_rf)
print("Confusion Matrix (Test Set - Random Forest):\n", cm_test_rf)
print("Accuracy (Test - Random Forest):", accuracy_test_rf)
print("Precision (Test - Random Forest):", precision_test_rf)
print("Recall (Test - Random Forest):", recall_test_rf)
print("F1 Score (Test - Random Forest):", f1_test_rf)
print("ROC AUC Score (Test - Random Forest):", roc_auc_test_rf)
Confusion Matrix (Test Set - Random Forest): [[66474 31492] [ 2869 5722]] Accuracy (Test - Random Forest): 0.6775340897360098 Precision (Test - Random Forest): 0.1537593378835922 Recall (Test - Random Forest): 0.6660458619485508 F1 Score (Test - Random Forest): 0.24984172033620783 ROC AUC Score (Test - Random Forest): 0.6723178462854968
# Compute ROC curve for test set for Random Forest
fpr_test_rf, tpr_test_rf, _ = roc_curve(t_test, y_test_prob_rf)
# Plot ROC curve for both training and test sets for Random Forest.
# BUG FIX: the original plotted the Decision Tree curves
# (fpr_train_dt/tpr_train_dt and fpr_test_dt/tpr_test_dt) while labelling
# them with the Random Forest AUCs; plot the RF curves instead.
plt.figure(figsize=(10, 5))
plt.plot(fpr_train_rf, tpr_train_rf, color='blue', lw=2, label=f'Train ROC curve (AUC = {roc_auc_train_rf:.2f})')
plt.plot(fpr_test_rf, tpr_test_rf, color='red', lw=2, label=f'Test ROC curve (AUC = {roc_auc_test_rf:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - Random Forest')
plt.legend()
plt.show()
# Feature importance for Decision Tree.
# BUG FIX: the original took decision_tree.feature_importances_[0] (a single
# scalar) and then built the DataFrame from `coefficients`, a leftover
# variable from the earlier logistic-regression section — the printed table
# even contained negative values, which tree importances can never have.
# Use the full importance vector from the fitted tree.
coefficients_dt = decision_tree.feature_importances_
# Create a DataFrame to store feature names and their importances
feature_coefficients_dt = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients_dt})
# Tree importances are non-negative, but keep the absolute-value column so
# the sorting/reporting shape matches the earlier models in this file
feature_coefficients_dt['Absolute Coefficient'] = np.abs(feature_coefficients_dt['Coefficient'])
feature_coefficients_dt = feature_coefficients_dt.sort_values(by='Absolute Coefficient', ascending=False)
# Print the top features
print("Top features:")
print(feature_coefficients_dt.head(10))  # Adjust the number of top features as needed
# Plot feature importances
plt.figure(figsize=(20, 16))
plt.barh(feature_coefficients_dt['Feature'], feature_coefficients_dt['Coefficient'])
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Feature Coefficients')
plt.show()
Top features:
Feature Coefficient Absolute Coefficient
7 REST_AVG_CUR -1.211265 1.211265
25 TURNOVER_DYNAMIC_CUR_1M 0.408601 0.408601
14 TRANS_COUNT_ATM_PRC 0.380209 0.380209
21 REST_DYNAMIC_CUR_1M -0.344919 0.344919
22 REST_AVG_PAYM -0.278476 0.278476
31 REST_DYNAMIC_PAYM_1M -0.236047 0.236047
35 TURNOVER_DYNAMIC_PAYM_1M 0.226137 0.226137
33 CLNT_SETUP_TENOR -0.222930 0.222930
24 REST_DYNAMIC_CUR_3M -0.155333 0.155333
9 TRANS_COUNT_SUP_PRC 0.141405 0.141405
# Feature importance for Random Forest.
# BUG FIX: same two defects as the Decision Tree cell — feature_importances_[0]
# is a single scalar, and the DataFrame was built from the stale `coefficients`
# variable from the logistic-regression section. Use the full importance
# vector from the fitted forest.
coefficients_rf = rf.feature_importances_
# Create a DataFrame to store feature names and their importances
feature_coefficients_rf = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients_rf})
# Importances are non-negative; the absolute-value column is kept only so the
# sorting/reporting shape matches the earlier models in this file
feature_coefficients_rf['Absolute Coefficient'] = np.abs(feature_coefficients_rf['Coefficient'])
feature_coefficients_rf = feature_coefficients_rf.sort_values(by='Absolute Coefficient', ascending=False)
# Print the top features
print("Top features:")
print(feature_coefficients_rf.head(10))  # Adjust the number of top features as needed
# Plot feature importances
plt.figure(figsize=(20, 16))
plt.barh(feature_coefficients_rf['Feature'], feature_coefficients_rf['Coefficient'])
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Feature Coefficients')
plt.show()
Top features:
Feature Coefficient Absolute Coefficient
7 REST_AVG_CUR -1.211265 1.211265
25 TURNOVER_DYNAMIC_CUR_1M 0.408601 0.408601
14 TRANS_COUNT_ATM_PRC 0.380209 0.380209
21 REST_DYNAMIC_CUR_1M -0.344919 0.344919
22 REST_AVG_PAYM -0.278476 0.278476
31 REST_DYNAMIC_PAYM_1M -0.236047 0.236047
35 TURNOVER_DYNAMIC_PAYM_1M 0.226137 0.226137
33 CLNT_SETUP_TENOR -0.222930 0.222930
24 REST_DYNAMIC_CUR_3M -0.155333 0.155333
9 TRANS_COUNT_SUP_PRC 0.141405 0.141405
# List the columns of the source DataFrame for reference
print(df.columns)
# Overlay the Decision Tree and Random Forest importance values on one chart
plt.figure(figsize=(10, 6))
plt.barh(feature_coefficients_dt['Feature'], feature_coefficients_dt['Coefficient'],
         color='blue', label='Decision Tree')
plt.barh(feature_coefficients_rf['Feature'], feature_coefficients_rf['Coefficient'],
         color='green', alpha=0.5, label='Random Forest')
plt.xlabel('Importance Score')
plt.title('Feature Importance Comparison')
plt.legend()
plt.tight_layout()
plt.show()
Index(['CR_PROD_CNT_IL', 'AMOUNT_RUB_CLO_PRC', 'TURNOVER_DYNAMIC_IL_1M',
'AMOUNT_RUB_SUP_PRC', 'REST_DYNAMIC_FDEP_1M', 'REST_DYNAMIC_SAVE_3M',
'CR_PROD_CNT_VCU', 'REST_AVG_CUR', 'AMOUNT_RUB_NAS_PRC',
'TRANS_COUNT_SUP_PRC', 'TRANS_COUNT_NAS_PRC', 'CR_PROD_CNT_TOVR',
'CR_PROD_CNT_PIL', 'TURNOVER_CC', 'TRANS_COUNT_ATM_PRC',
'AMOUNT_RUB_ATM_PRC', 'TURNOVER_PAYM', 'CR_PROD_CNT_CC',
'REST_DYNAMIC_FDEP_3M', 'REST_DYNAMIC_IL_1M', 'CR_PROD_CNT_CCFP',
'REST_DYNAMIC_CUR_1M', 'REST_AVG_PAYM', 'LDEAL_GRACE_DAYS_PCT_MED',
'REST_DYNAMIC_CUR_3M', 'TURNOVER_DYNAMIC_CUR_1M',
'REST_DYNAMIC_PAYM_3M', 'REST_DYNAMIC_IL_3M', 'CNT_TRAN_ATM_TENDENCY3M',
'TURNOVER_DYNAMIC_IL_3M', 'SUM_TRAN_ATM_TENDENCY3M',
'REST_DYNAMIC_PAYM_1M', 'TURNOVER_DYNAMIC_CUR_3M', 'CLNT_SETUP_TENOR',
'TURNOVER_DYNAMIC_PAYM_3M', 'TURNOVER_DYNAMIC_PAYM_1M',
'TRANS_AMOUNT_TENDENCY3M', 'TRANS_CNT_TENDENCY3M', 'REST_DYNAMIC_CC_1M',
'TURNOVER_DYNAMIC_CC_1M', 'REST_DYNAMIC_CC_3M',
'TURNOVER_DYNAMIC_CC_3M', 'TARGET', 'PACK_101', 'PACK_102', 'PACK_103',
'PACK_104', 'PACK_105', 'PACK_107', 'PACK_108', 'PACK_109', 'PACK_301',
'PACK_K01', 'PACK_M01', 'PACK_O01', 'total_turnover', 'TOTAL_PROD_CNT',
'AGE_GROUP_Group 1', 'AGE_GROUP_Group 2', 'AGE_GROUP_Group 3',
'AGE_GROUP_Group 4', 'AGE_GROUP_Group 5'],
dtype='object')
# Empirical tuning using grid search for the Decision Tree.
# FIX: max_features='auto' was deprecated and then removed in scikit-learn
# (for classifiers it was simply an alias of 'sqrt'), so the original
# ['auto', 'sqrt'] both breaks on current versions and searched the same
# setting twice. Keeping only 'sqrt' leaves the effective search space
# unchanged.
param_grid_dt = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
}
# Initialize DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
# Perform Grid Search with 5-fold cross-validation
grid_search_dt = GridSearchCV(estimator=dt_classifier, param_grid=param_grid_dt, cv=5)
grid_search_dt.fit(f_train_scaled, t_train_undersampled)
# Print best parameters and best score for Decision Tree
print("Best Parameters for Decision Tree:", grid_search_dt.best_params_)
print("Best Score for Decision Tree:", grid_search_dt.best_score_)
# Evaluate the grid-searched Decision Tree on the undersampled training data
y_train_pred_grdt = grid_search_dt.predict(f_train_scaled)
# Positive-class probabilities for the ROC-based metrics
y_train_prob_grdt = grid_search_dt.predict_proba(f_train_scaled)[:, 1]
# Confusion matrix and derived metrics on the training set
cm_train_grdt = confusion_matrix(t_train_undersampled, y_train_pred_grdt)
accuracy_train_grdt = (cm_train_grdt[0, 0] + cm_train_grdt[1, 1]) / np.sum(cm_train_grdt)
precision_train_grdt = precision_score(t_train_undersampled, y_train_pred_grdt)
recall_train_grdt = recall_score(t_train_undersampled, y_train_pred_grdt)
f1_train_grdt = f1_score(t_train_undersampled, y_train_pred_grdt)
# ROC AUC and curve points on the training set
roc_auc_train_grdt = roc_auc_score(t_train_undersampled, y_train_prob_grdt)
fpr_train_grdt, tpr_train_grdt, _ = roc_curve(t_train_undersampled, y_train_prob_grdt)
# Report every training-set metric
print("Confusion Matrix (Training Set):\n", cm_train_grdt)
for label, value in [("Accuracy", accuracy_train_grdt),
                     ("Precision", precision_train_grdt),
                     ("Recall", recall_train_grdt),
                     ("F1 Score", f1_train_grdt),
                     ("ROC AUC Score", roc_auc_train_grdt)]:
    print(f"{label} (Train):", value)
# Evaluate the grid-searched Decision Tree on the held-out test data
y_test_pred_grdt = grid_search_dt.predict(f_test_scaled)
# Positive-class probabilities for the ROC-based metrics
y_test_prob_grdt = grid_search_dt.predict_proba(f_test_scaled)[:, 1]
# Confusion matrix and derived metrics on the test set
cm_test_grdt = confusion_matrix(t_test, y_test_pred_grdt)
accuracy_test_grdt = (cm_test_grdt[0, 0] + cm_test_grdt[1, 1]) / np.sum(cm_test_grdt)
precision_test_grdt = precision_score(t_test, y_test_pred_grdt)
recall_test_grdt = recall_score(t_test, y_test_pred_grdt)
f1_test_grdt = f1_score(t_test, y_test_pred_grdt)
roc_auc_test_grdt = roc_auc_score(t_test, y_test_prob_grdt)
# Report every test-set metric
print("Confusion Matrix (Test Set):\n", cm_test_grdt)
for label, value in [("Accuracy", accuracy_test_grdt),
                     ("Precision", precision_test_grdt),
                     ("Recall", recall_test_grdt),
                     ("F1 Score", f1_test_grdt),
                     ("ROC AUC Score", roc_auc_test_grdt)]:
    print(f"{label} (Test):", value)
# ROC curve points for the grid-searched Decision Tree on the test set
fpr_test_grdt, tpr_test_grdt, _ = roc_curve(t_test, y_test_prob_grdt)
# Overlay train vs. test ROC curves against the chance diagonal
plt.figure(figsize=(10, 5))
plt.plot(fpr_train_grdt, tpr_train_grdt, color='blue', lw=2,
         label=f'Train ROC curve (AUC = {roc_auc_train_grdt:.2f})')
plt.plot(fpr_test_grdt, tpr_test_grdt, color='red', lw=2,
         label=f'Test ROC curve (AUC = {roc_auc_test_grdt:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
Best Parameters for Decision Tree: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score for Decision Tree: 0.6973294725133191
Confusion Matrix (Training Set):
[[13294 7040]
[ 3678 16656]]
Accuracy (Train): 0.7364512638929871
Precision (Train): 0.7029034436191762
Recall (Train): 0.8191206845677191
F1 Score (Train): 0.7565750624574153
ROC AUC Score (Train): 0.8120586207869641
Confusion Matrix (Test Set):
[[60419 37547]
[ 1822 6769]]
Accuracy (Test): 0.6305357695881078
Precision (Test): 0.1527439299575774
Recall (Test): 0.7879175881736701
F1 Score (Test): 0.2558829644470486
ROC AUC Score (Test): 0.7565873346583987
# Define hyperparameters grid for Random Forest.
# FIXES: (1) the grid variable was misnamed param_grid_dt, clobbering the
# Decision Tree grid above; (2) the summary prints said "Decision Tree" for
# the Random Forest search; (3) max_features='auto' was removed from
# scikit-learn (it was an alias of 'sqrt' for classifiers), so keeping only
# 'sqrt' preserves the effective search space.
param_grid_rf = {
    'n_estimators': [50],
    'max_depth': [10],
    'min_samples_split': [5],
    'min_samples_leaf': [2],
    'max_features': ['sqrt'],
}
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier()
# Perform Grid Search with 5-fold cross-validation
grid_search_rf = GridSearchCV(estimator=rf_classifier, param_grid=param_grid_rf, cv=5)
grid_search_rf.fit(f_train_scaled, t_train_undersampled)
# Print best parameters and best score for Random Forest
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Score for Random Forest:", grid_search_rf.best_score_)
# Evaluate the grid-searched Random Forest on the undersampled training data
y_train_pred_grrf = grid_search_rf.predict(f_train_scaled)
# Positive-class probabilities for the ROC-based metrics
y_train_prob_grrf = grid_search_rf.predict_proba(f_train_scaled)[:, 1]
# Confusion matrix and derived metrics on the training set
cm_train_grrf = confusion_matrix(t_train_undersampled, y_train_pred_grrf)
accuracy_train_grrf = (cm_train_grrf[0, 0] + cm_train_grrf[1, 1]) / np.sum(cm_train_grrf)
precision_train_grrf = precision_score(t_train_undersampled, y_train_pred_grrf)
recall_train_grrf = recall_score(t_train_undersampled, y_train_pred_grrf)
f1_train_grrf = f1_score(t_train_undersampled, y_train_pred_grrf)
# ROC AUC and curve points on the training set
roc_auc_train_grrf = roc_auc_score(t_train_undersampled, y_train_prob_grrf)
fpr_train_grrf, tpr_train_grrf, _ = roc_curve(t_train_undersampled, y_train_prob_grrf)
# Report every training-set metric
print("Confusion Matrix (Training Set):\n", cm_train_grrf)
for label, value in [("Accuracy", accuracy_train_grrf),
                     ("Precision", precision_train_grrf),
                     ("Recall", recall_train_grrf),
                     ("F1 Score", f1_train_grrf),
                     ("ROC AUC Score", roc_auc_train_grrf)]:
    print(f"{label} (Train):", value)
# Evaluate the grid-searched Random Forest on the held-out test data
y_test_pred_grrf = grid_search_rf.predict(f_test_scaled)
# Positive-class probabilities for the ROC-based metrics
y_test_prob_grrf = grid_search_rf.predict_proba(f_test_scaled)[:, 1]
# Confusion matrix and derived metrics on the test set
cm_test_grrf = confusion_matrix(t_test, y_test_pred_grrf)
accuracy_test_grrf = (cm_test_grrf[0, 0] + cm_test_grrf[1, 1]) / np.sum(cm_test_grrf)
precision_test_grrf = precision_score(t_test, y_test_pred_grrf)
recall_test_grrf = recall_score(t_test, y_test_pred_grrf)
f1_test_grrf = f1_score(t_test, y_test_pred_grrf)
roc_auc_test_grrf = roc_auc_score(t_test, y_test_prob_grrf)
# Report every test-set metric
print("Confusion Matrix (Test Set):\n", cm_test_grrf)
for label, value in [("Accuracy", accuracy_test_grrf),
                     ("Precision", precision_test_grrf),
                     ("Recall", recall_test_grrf),
                     ("F1 Score", f1_test_grrf),
                     ("ROC AUC Score", roc_auc_test_grrf)]:
    print(f"{label} (Test):", value)
# ROC curve points for the grid-searched Random Forest on the test set
fpr_test_grrf, tpr_test_grrf, _ = roc_curve(t_test, y_test_prob_grrf)
# Overlay train vs. test ROC curves against the chance diagonal
plt.figure(figsize=(10, 5))
plt.plot(fpr_train_grrf, tpr_train_grrf, color='blue', lw=2,
         label=f'Train ROC curve (AUC = {roc_auc_train_grrf:.2f})')
plt.plot(fpr_test_grrf, tpr_test_grrf, color='red', lw=2,
         label=f'Test ROC curve (AUC = {roc_auc_test_grrf:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
Best Parameters for Decision Tree: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best Score for Decision Tree: 0.7466066707377845
Confusion Matrix (Training Set):
[[14855 5479]
[ 3077 17257]]
Accuracy (Train): 0.7896134552965477
Precision (Train): 0.7590165376495426
Recall (Train): 0.8486770925543424
F1 Score (Train): 0.8013466449965173
ROC AUC Score (Train): 0.8688035132941526
Confusion Matrix (Test Set):
[[66241 31725]
[ 1659 6932]]
Accuracy (Test): 0.6867028914102311
Precision (Test): 0.17932069224202601
Recall (Test): 0.806890932371086
F1 Score (Test): 0.2934304097527938
ROC AUC Score (Test): 0.8110961011696805
# Random Forest gives the best test-set accuracy after hyperparameter tuning